#!/bin/bash
# =============================================================================
# Round 9: 5-class R9 dataset training
#
# R9 data:
# - R8 strict/deleaked train
# - CyberNER_harmonized deleaked + OPF span-format normalized
# - DNRTI deleaked
# - Prefix-80 deduplicated
#
# Main change vs R8:
# - O-downsample lowered from 0.7 to 0.3. R8 improved recall with 0.7, but
# overpredicted Indicator; R9 has better Org/System coverage and should not
# need such an aggressive O-token loss mask.
#
# Defaults target RTX PRO 6000 96GB. Override BATCH_SIZE/GRAD_ACCUM_STEPS for
# smaller GPUs, e.g. BATCH_SIZE=1 GRAD_ACCUM_STEPS=8 on a 32GB RTX 5090.
# =============================================================================
# -----------------------------------------------------------------------------
# Shell options, working directory, and run configuration.
# Each NAME="${NAME:-default}" below keeps a value supplied by the caller's
# environment and otherwise falls back to the default shown.
# -----------------------------------------------------------------------------

# Stop on the first failing command, on any unset variable, and when any stage
# of a pipeline fails (so a failure is not hidden by the `tee` that follows it).
set -euo pipefail

# All relative paths in this script resolve against the project root. It
# defaults to ~/alkyline; set ALKYLINE_ROOT to run from a checkout elsewhere.
ALKYLINE_ROOT=${ALKYLINE_ROOT:-~/alkyline}
cd "$ALKYLINE_ROOT"

export PYTORCH_CUDA_ALLOC_CONF="${PYTORCH_CUDA_ALLOC_CONF:-expandable_segments:True}"

# Input and output locations (relative to the project root unless overridden).
DATA="${DATA:-data/processed}"
LABELS="${LABELS:-data/label_spaces}"
RESULTS="${RESULTS:-results}"

# Interpreter: use $PYTHON (default: the project's .venv) when it is an
# executable file, otherwise fall back to the first python3 found on PATH.
PYTHON="${PYTHON:-$PWD/.venv/bin/python}"
if [[ ! -x "$PYTHON" ]]; then
  # `command -v` prints nothing when python3 is missing; without this explicit
  # check `set -e` would end the script here with no explanation.
  if ! PYTHON="$(command -v python3)"; then
    echo "ERROR: no usable Python interpreter: the configured PYTHON is not executable and python3 is not on PATH" >&2
    exit 1
  fi
fi

# Command prefix (as an array) for running the opf module with that interpreter.
OPF=("$PYTHON" -m opf)

# Training settings; these are forwarded to `opf train` and to the early-stop
# monitor further down in this script.
PATIENCE="${PATIENCE:-3}"
BATCH_SIZE="${BATCH_SIZE:-4}"
GRAD_ACCUM_STEPS="${GRAD_ACCUM_STEPS:-2}"
EPOCHS="${EPOCHS:-15}"
LEARNING_RATE="${LEARNING_RATE:-5e-5}"
O_DOWNSAMPLE="${O_DOWNSAMPLE:-0.3}"
DEVICE="${DEVICE:-cuda}"
TRAIN_LOG="${TRAIN_LOG:-train_r9.log}"
CKPT_DIR="${CKPT_DIR:-checkpoints/r9_5class}"

mkdir -p "$RESULTS"
# Pre-flight: run the dataset build script, then the readiness audit script.
# Each step's combined stdout/stderr is shown on the console and copied to a
# log file in the working directory.
printf '%s\n' \
  "================================================================" \
  " R9 PRE-FLIGHT" \
  "================================================================"

"$PYTHON" scripts/build_r9_dataset.py 2>&1 | tee build_r9.log

# The audit writes a JSON and a Markdown report under $RESULTS.
audit_args=(
  --json-out "$RESULTS/r9_readiness_audit.json"
  --md-out "$RESULTS/r9_readiness_audit.md"
)
"$PYTHON" scripts/audit_r9_readiness.py "${audit_args[@]}" 2>&1 \
  | tee audit_r9_readiness.log
# -----------------------------------------------------------------------------
# Training phase: print the effective configuration, start `opf train` in the
# background, start the early-stop monitor next to it, and wait for training
# to end (either on its own or because it was terminated).
# -----------------------------------------------------------------------------
echo "================================================================"
echo " R9: strict merged 5-class training"
echo "================================================================"
echo "Start time: $(date)"
echo "Train data: $DATA/r9_5class_train.jsonl"
echo "Validation data: $DATA/r9_5class_valid.jsonl"
echo "Checkpoint dir: $CKPT_DIR"
echo "Batch/accum: $BATCH_SIZE / $GRAD_ACCUM_STEPS"
echo "Epochs/patience: $EPOCHS / $PATIENCE"
echo "LR: $LEARNING_RATE"
echo "O-downsample: $O_DOWNSAMPLE"
echo "Device: $DEVICE"
echo "================================================================"

# Output is mirrored to $TRAIN_LOG through a process substitution rather than
# `| tee ... &`: after a backgrounded pipeline, $! is the PID of the LAST stage
# (tee), so the monitor would have been handed tee's PID instead of the
# trainer's. With the redirection form, $! is the training process itself.
# NOTE: tee is no longer part of the waited-for job; it exits on its own once
# the trainer closes its output.
"${OPF[@]}" train "$DATA/r9_5class_train.jsonl" \
  --validation-dataset "$DATA/r9_5class_valid.jsonl" \
  --label-space-json "$LABELS/cyner_5class.json" \
  --output-dir "$CKPT_DIR" \
  --overwrite-output \
  --epochs "$EPOCHS" --batch-size "$BATCH_SIZE" --grad-accum-steps "$GRAD_ACCUM_STEPS" \
  --learning-rate "$LEARNING_RATE" \
  --warmup-fraction 0.1 --lr-schedule cosine \
  --loss-fn focal --focal-gamma 2.0 \
  --llrd-factor 0.9 \
  --o-downsample "$O_DOWNSAMPLE" \
  --device "$DEVICE" > >(tee "$TRAIN_LOG") 2>&1 &
TRAIN_PID=$!

# The monitor receives the log path, the patience value and the trainer PID
# (presumably to stop that process when validation stops improving; see
# scripts/early_stop_monitor.sh).
bash scripts/early_stop_monitor.sh "$TRAIN_LOG" "$PATIENCE" "$TRAIN_PID" &
MONITOR_PID=$!

# A non-zero status is tolerated on purpose (a trainer stopped by the monitor
# exits non-zero), but it is reported instead of being discarded so that a
# genuine crash is visible before the evaluation phase starts.
TRAIN_STATUS=0
wait "$TRAIN_PID" 2>/dev/null || TRAIN_STATUS=$?

# The monitor may already have exited; ignore the error in that case.
kill "$MONITOR_PID" 2>/dev/null || true

if (( TRAIN_STATUS != 0 )); then
  echo "WARNING: training process exited with status $TRAIN_STATUS" >&2
fi
echo "Training finished: $(date)"
#######################################
# Print the epoch whose val_loss is lowest in a training log.
# Only lines beginning with "epoch" are considered. Each such line is split on
# space, ':', '/' and '='; the second field is taken as the epoch and the field
# following a "val_loss" field as the loss (same parsing as before).
# Losses are compared numerically by awk, so exponent notation (e.g. 9.5e-04)
# orders correctly, and values with no leading number (e.g. nan) are skipped
# instead of being ranked as 0. On ties the earliest matching line wins.
# Arguments: $1 - path to the training log
# Outputs:   the best epoch on stdout; nothing if the log is unreadable or
#            contains no usable val_loss entry
# Returns:   0 in both cases, so a log without epoch lines cannot abort the
#            script through `set -e` / `pipefail`
#######################################
best_epoch_by_val_loss() {
  local log_file=$1
  [[ -r "$log_file" ]] || return 0
  awk -F'[ :/=]' '
    /^epoch/ {
      for (i = 1; i < NF; i++) {
        if ($i != "val_loss") continue
        raw = $(i + 1)
        # Keep only the leading numeric part (tolerates a trailing comma).
        if (!match(raw, /^[-+]?([0-9]+\.?[0-9]*|\.[0-9]+)([eE][-+]?[0-9]+)?/)) continue
        loss = substr(raw, RSTART, RLENGTH) + 0
        if (!found || loss < best_loss) {
          best_loss = loss
          best_epoch = $2
          found = 1
        }
      }
    }
    END { if (found) print best_epoch }
  ' "$log_file"
}

BEST_EPOCH="$(best_epoch_by_val_loss "$TRAIN_LOG")"
echo "Best epoch by val_loss: $BEST_EPOCH"

# Checkpoint selection:
#   - no final model.safetensors but a best epoch is known -> that epoch's dir
#     (the run ended before the final model was written);
#   - final model present and the best epoch's dir exists  -> prefer that dir;
#   - otherwise                                             -> $CKPT_DIR itself.
CKPT="$CKPT_DIR"
if [[ ! -f "$CKPT/model.safetensors" && -n "$BEST_EPOCH" ]]; then
  echo "Training killed early - using epoch $BEST_EPOCH checkpoint"
  CKPT="${CKPT_DIR}/epoch_${BEST_EPOCH}"
elif [[ -n "$BEST_EPOCH" && -d "${CKPT_DIR}/epoch_${BEST_EPOCH}" ]]; then
  echo "Using best-epoch checkpoint (epoch $BEST_EPOCH) over final"
  CKPT="${CKPT_DIR}/epoch_${BEST_EPOCH}"
fi

# Diagnostic only: the commands that consume $CKPT still decide whether the
# run can continue, but the likely cause is now stated up front.
if [[ ! -d "$CKPT" ]]; then
  echo "WARNING: selected checkpoint directory does not exist: $CKPT" >&2
fi
echo "Selected checkpoint: $CKPT"
# -----------------------------------------------------------------------------
# Evaluation phase: run `opf eval` with the selected checkpoint on four test
# sets, mirroring each run's output to its own log file.
# -----------------------------------------------------------------------------

# Flags shared by every `opf eval` call. Kept as an array (like OPF) so that a
# checkpoint path containing spaces or glob characters remains one argument;
# the previous flat string relied on unquoted word splitting.
EVAL_FLAGS=(
  --checkpoint "$CKPT"
  --decode-mode viterbi
  --per-class
  --label-counts
  --device "$DEVICE"
)

#######################################
# Evaluate one dataset with the shared flags.
# Globals:   OPF, EVAL_FLAGS (read)
# Arguments: $1 - title shown in the section heading
#            $2 - dataset path
#            $3 - log file receiving a copy of the combined stdout/stderr
# Returns:   status of the eval pipeline (a failure aborts via `set -e`)
#######################################
run_opf_eval() {
  local title=$1 dataset=$2 log_file=$3
  echo ""
  echo "===== Eval R9: $title ====="
  "${OPF[@]}" eval "$dataset" "${EVAL_FLAGS[@]}" 2>&1 | tee "$log_file"
}

echo ""
echo "================================================================"
echo " EVALUATION PHASE"
echo "================================================================"

run_opf_eval "Enriched 5-class test" \
  "$DATA/enriched_5class_test.jsonl" eval_r9_enriched.log
run_opf_eval "CyNER test" \
  "$DATA/cyner_test.jsonl" eval_r9_cyner.log
run_opf_eval "SecureBERT2 5-class test" \
  "$DATA/securebert2_5class_test.jsonl" eval_r9_sb2.log
run_opf_eval "APTNER 5-class independent test" \
  "$DATA/aptner_5class_test_clean.jsonl" eval_r9_aptner.log
# Viterbi grid search on the R9 validation split using the selected
# checkpoint; the script's result file goes under $RESULTS and its console
# output is copied to viterbi_r9.log.
printf '\n%s\n' "===== Viterbi Grid Search (R9 validation) ====="

grid_search_args=(
  --checkpoint "$CKPT"
  --val-data "$DATA/r9_5class_valid.jsonl"
  --output "$RESULTS/viterbi_r9_best.json"
  --device "$DEVICE"
)
"$PYTHON" scripts/viterbi_grid_search.py "${grid_search_args[@]}" 2>&1 \
  | tee viterbi_r9.log
#######################################
# Run the exact-match evaluation script on one test set with Viterbi decoding.
# Globals:   PYTHON, CKPT, DEVICE, RESULTS (read)
# Arguments: $1 - heading text for the section banner
#            $2 - test dataset path
#            $3 - short tag used in the JSON report and log file names
# Outputs:   combined stdout/stderr on the console and in exact_r9_<tag>.log
#######################################
exact_match_eval() {
  local heading=$1 test_file=$2 tag=$3
  printf '\n%s\n' "===== Exact-Match Eval R9: $heading ====="
  "$PYTHON" scripts/eval_exact_match.py \
    --checkpoint "$CKPT" \
    --test-data "$test_file" \
    --device "$DEVICE" \
    --decode-mode viterbi \
    --json-out "$RESULTS/r9_${tag}_exact_match.json" \
    2>&1 | tee "exact_r9_${tag}.log"
}

exact_match_eval "CyNER test" "$DATA/cyner_test.jsonl" cyner
exact_match_eval "APTNER independent test" "$DATA/aptner_5class_test_clean.jsonl" aptner
#######################################
# Print a heading followed by up to 20 lines of an eval log that contain
# "micro" or "macro" or that begin with a space. A missing log or a log with
# no matching lines is not an error.
# Arguments: $1 - heading text
#            $2 - eval log path
#######################################
metric_excerpt() {
  local heading=$1 log_file=$2
  echo ""
  echo "--- $heading ---"
  grep -E '(micro|macro|^ )' "$log_file" 2>/dev/null | head -20 || true
}

# Closing banner: where the checkpoint and logs are, then metric excerpts.
cat <<EOF

================================================================
 R9 COMPLETE - $(date)
================================================================
Checkpoint: $CKPT
Logs: $TRAIN_LOG, eval_r9_{enriched,cyner,sb2,aptner}.log, exact_r9_{cyner,aptner}.log, viterbi_r9.log
EOF

metric_excerpt "Enriched" eval_r9_enriched.log
metric_excerpt "APTNER independent" eval_r9_aptner.log
echo "================================================================"
|