arcspan / scripts /run_train_v8.sh
chairulridjal's picture
Add files using upload-large-folder tool
3dac39e verified
#!/bin/bash
# =============================================================================
# Round 8: FINAL training run on DELEAKED 5-class cybersecurity NER data
#
# Key differences from R6:
# - Data: r8_5class_{train,valid}.jsonl — fully deleaked (no train/test overlap)
# - O-token downsampling: --o-downsample 0.7 (mask 70% of O-tokens from loss)
# This should boost entity recall given ~85-90% O-token prevalence
# - Four evaluation test sets (incl. APTNER zero-leakage independent benchmark)
# - Same proven hyperparams: focal(γ=2), cosine LR, LLRD 0.9, LR 5e-5
#
# Expect slightly worse val_loss vs R6 (honest eval on clean data).
# GPU: RTX PRO 6000 96GB VRAM
# =============================================================================
# Strict mode: stop on a failing command, on use of an unset variable, and
# when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# Every relative path used below (data/, checkpoints/, scripts/) is resolved
# from this directory.
cd ~/alkyline

# Exported so that child processes started by this script inherit it.
PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export PYTORCH_CUDA_ALLOC_CONF
# ---------------------------------------------------------------------------
# Run configuration
# ---------------------------------------------------------------------------
# Input locations (relative to the working directory).
DATA="data/processed"
LABELS="data/label_spaces"

# Output locations for this round.
CKPT_DIR="checkpoints/r8_5class"
TRAIN_LOG="train_r8.log"

# Handed to scripts/early_stop_monitor.sh: number of epochs without val_loss
# improvement that are tolerated before training is stopped.
PATIENCE="3"
# ---------------------------------------------------------------------------
# Training
# ---------------------------------------------------------------------------
echo "===== R8: Deleaked 5-class (focal + cosine + LLRD + O-downsample) ====="
echo "Start time: $(date)"

# Trainer output is mirrored to the console and to $TRAIN_LOG through a
# process substitution instead of a pipeline. For a backgrounded pipeline
# `a | b &`, $! is the PID of the LAST stage (tee), so the early-stop monitor
# would be given the wrong process to stop. With a plain redirection, $! is
# the PID of the trainer itself.
opf train "$DATA/r8_5class_train.jsonl" \
  --validation-dataset "$DATA/r8_5class_valid.jsonl" \
  --label-space-json "$LABELS/cyner_5class.json" \
  --output-dir "$CKPT_DIR" \
  --overwrite-output \
  --epochs 15 --batch-size 4 --grad-accum-steps 2 \
  --learning-rate 5e-5 \
  --warmup-fraction 0.1 --lr-schedule cosine \
  --loss-fn focal --focal-gamma 2.0 \
  --llrd-factor 0.9 \
  --o-downsample 0.7 \
  --device cuda > >(tee "$TRAIN_LOG") 2>&1 &
TRAIN_PID=$!

# Early stopping monitor — kills training if val_loss hasn't improved for $PATIENCE epochs
bash scripts/early_stop_monitor.sh "$TRAIN_LOG" "$PATIENCE" "$TRAIN_PID" &
MONITOR_PID=$!

# A non-zero status is expected when the monitor stops the trainer, so it must
# not abort the script under `set -e`; it is recorded and reported instead of
# being silently discarded.
TRAIN_STATUS=0
wait "$TRAIN_PID" 2>/dev/null || TRAIN_STATUS=$?

# The monitor may already have exited on its own; ignoring that is intentional.
kill "$MONITOR_PID" 2>/dev/null || true
echo "Training finished: $(date)"
if (( TRAIN_STATUS != 0 )); then
  echo "Trainer exited with status $TRAIN_STATUS (expected when early stopping ends the run)." >&2
fi
# ---------------------------------------------------------------------------
# Best-epoch checkpoint selection
#
# The trainer saves per-epoch checkpoints as epoch_N/ and a final model.
# If early stopping killed training, the final model.safetensors may not
# exist. In that case, find the epoch with the lowest val_loss and use it.
# ---------------------------------------------------------------------------

#######################################
# Print the epoch number with the lowest val_loss found in a training log.
# Only lines starting with "epoch" are considered. Each such line is split on
# space, ':', '/' and '='; the second field is taken as the epoch number and
# the field following "val_loss" as the loss value.
# Arguments: $1 - path to the training log
# Outputs:   best epoch on stdout; nothing when the log is missing or holds
#            no parsable val_loss entry
# Returns:   0 in all of those cases, so a caller running under
#            `set -e -o pipefail` is not aborted by an empty or missing log
#######################################
best_epoch_from_log() {
  local log_file=$1
  [[ -f "$log_file" ]] || return 0
  awk -F'[ :/=]' '
    /^epoch/ {
      for (i = 1; i < NF; i++) {
        # Ignore values that do not start like a number (e.g. "nan").
        if ($i == "val_loss" && $(i + 1) ~ /^[-+]?[0-9.]/) {
          loss = $(i + 1) + 0   # force numeric compare; handles 1e-3 notation
          if (!seen || loss < best) {
            best = loss
            epoch = $2
            seen = 1
          }
        }
      }
    }
    END { if (seen) print epoch }
  ' "$log_file"
}

BEST_EPOCH=$(best_epoch_from_log "$TRAIN_LOG")
echo "Best epoch by val_loss: $BEST_EPOCH"

CKPT="$CKPT_DIR"
if [[ ! -f "$CKPT/model.safetensors" && -n "$BEST_EPOCH" ]]; then
  echo "Training killed early — using epoch $BEST_EPOCH checkpoint"
  CKPT="${CKPT_DIR}/epoch_${BEST_EPOCH}"
elif [[ -n "$BEST_EPOCH" && -d "${CKPT_DIR}/epoch_${BEST_EPOCH}" ]]; then
  # Even if training ran to completion, prefer the best-epoch checkpoint
  echo "Using best-epoch checkpoint (epoch $BEST_EPOCH) over final"
  CKPT="${CKPT_DIR}/epoch_${BEST_EPOCH}"
fi
echo "Selected checkpoint: $CKPT"

# Surface a missing checkpoint here rather than as an opaque failure in the
# first evaluation command.
if [[ ! -d "$CKPT" ]]; then
  echo "WARNING: selected checkpoint directory '$CKPT' does not exist" >&2
fi
# ---------------------------------------------------------------------------
# Evaluation — four test sets
# ---------------------------------------------------------------------------
# Flags shared by every `opf eval` call. Held in an array so that each flag,
# and the checkpoint path in particular, reaches opf as exactly one argument
# with no word splitting or glob expansion.
EVAL_FLAGS=(
  --checkpoint "$CKPT"
  --decode-mode viterbi
  --per-class
  --label-counts
  --device cuda
)

echo ""
echo "================================================================"
echo " EVALUATION PHASE"
echo "================================================================"

# 1) Primary benchmark: enriched 5-class test
echo ""
echo "===== Eval R8: Enriched 5-class test (primary benchmark) ====="
opf eval "$DATA/enriched_5class_test.jsonl" \
  "${EVAL_FLAGS[@]}" 2>&1 | tee eval_r8_enriched.log

# 2) CyNER test (supplementary — 98% overlap with enriched)
echo ""
echo "===== Eval R8: CyNER test (supplementary, ~98% overlap) ====="
opf eval "$DATA/cyner_test.jsonl" \
  "${EVAL_FLAGS[@]}" 2>&1 | tee eval_r8_cyner.log

# 3) SecureBERT2 test (supplementary — 96% overlap)
echo ""
echo "===== Eval R8: SecureBERT2 5-class test (supplementary, ~96% overlap) ====="
opf eval "$DATA/securebert2_5class_test.jsonl" \
  "${EVAL_FLAGS[@]}" 2>&1 | tee eval_r8_sb2.log

# 4) APTNER — independent benchmark (ZERO leakage, most important)
echo ""
echo "===== Eval R8: APTNER 5-class test (INDEPENDENT — zero leakage) ====="
opf eval "$DATA/aptner_5class_test_clean.jsonl" \
  "${EVAL_FLAGS[@]}" 2>&1 | tee eval_r8_aptner.log
# ---------------------------------------------------------------------------
# Viterbi grid search on validation set
# Find optimal Viterbi transition penalties for inference
# ---------------------------------------------------------------------------
echo ""
echo "===== Viterbi Grid Search (on validation set) ====="

VITERBI_OUT=results/viterbi_r8_best.json
# Nothing earlier in this script creates the output directory.
# NOTE(review): viterbi_grid_search.py may already create missing parent
# directories — confirm; mkdir -p is a no-op when the directory exists.
mkdir -p -- "$(dirname -- "$VITERBI_OUT")"

python3 scripts/viterbi_grid_search.py \
  --checkpoint "$CKPT" \
  --val-data "$DATA/r8_5class_valid.jsonl" \
  --output "$VITERBI_OUT" \
  --device cuda 2>&1 | tee viterbi_r8.log
# ---------------------------------------------------------------------------
# Summary
# ---------------------------------------------------------------------------
SUMMARY_RULE="================================================================"

# Print at most 20 lines of an eval log that mention micro/macro scores or
# start with a space. Best effort: a missing log or no matching line is not
# an error.
show_key_results() {
  grep -E '(micro|macro|^ )' "$1" 2>/dev/null | head -20 || true
}

printf '\n'
printf '%s\n' "$SUMMARY_RULE"
printf '%s\n' " R8 COMPLETE — $(date)"
printf '%s\n' "$SUMMARY_RULE"
printf '%s\n' "Checkpoint: $CKPT"
printf '%s\n' "Logs: $TRAIN_LOG, eval_r8_{enriched,cyner,sb2,aptner}.log, viterbi_r8.log"
printf '\n'
printf '%s\n' "Key results:"

printf '%s\n' "--- Enriched (primary) ---"
show_key_results eval_r8_enriched.log
printf '\n'

printf '%s\n' "--- APTNER (independent) ---"
show_key_results eval_r8_aptner.log

printf '%s\n' "$SUMMARY_RULE"