File size: 6,289 Bytes
3dac39e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
#!/bin/bash
# =============================================================================
# Round 9: 5-class R9 dataset training
#
# R9 data:
#   - R8 strict/deleaked train
#   - CyberNER_harmonized deleaked + OPF span-format normalized
#   - DNRTI deleaked
#   - Prefix-80 deduplicated
#
# Main change vs R8:
#   - O-downsample lowered from 0.7 to 0.3. R8 improved recall with 0.7, but
#     overpredicted Indicator; R9 has better Org/System coverage and should not
#     need such an aggressive O-token loss mask.
#
# Defaults target RTX PRO 6000 96GB. Override BATCH_SIZE/GRAD_ACCUM_STEPS for
# smaller GPUs, e.g. BATCH_SIZE=1 GRAD_ACCUM_STEPS=8 on a 32GB RTX 5090.
# =============================================================================
# Strict mode: abort on a failed command, an unset variable, or a failed
# pipeline stage.
set -o errexit -o nounset -o pipefail

# All relative paths used below are resolved from this directory.
cd ~/alkyline

# Keep an allocator configuration that is already present in the environment;
# otherwise fall back to expandable segments.
: "${PYTORCH_CUDA_ALLOC_CONF:=expandable_segments:True}"
export PYTORCH_CUDA_ALLOC_CONF

# Input and output locations, each overridable from the environment.
: "${DATA:=data/processed}"
: "${LABELS:=data/label_spaces}"
: "${RESULTS:=results}"

#######################################
# Choose the Python interpreter used for every step of this run.
# Arguments: $1 - preferred interpreter (a path or a bare command name)
# Outputs:   the interpreter to use on stdout; an error on stderr when
#            neither the preferred interpreter nor python3 is available
# Returns:   0 when an interpreter was found, 1 otherwise
#######################################
resolve_python() {
  local preferred=$1
  local fallback

  # `command -v` succeeds for executable paths and for names found on PATH,
  # so an override such as PYTHON=python3.11 is kept instead of being
  # silently replaced by python3.
  if [ -n "$preferred" ] && command -v "$preferred" >/dev/null 2>&1; then
    printf '%s\n' "$preferred"
    return 0
  fi

  if fallback="$(command -v python3)"; then
    printf '%s\n' "$fallback"
    return 0
  fi

  echo "error: no usable Python interpreter ('$preferred' not runnable, python3 not on PATH)" >&2
  return 1
}

# Default to the project virtualenv. A failed lookup aborts the script here
# through errexit, after resolve_python has printed the reason.
PYTHON="$(resolve_python "${PYTHON:-$PWD/.venv/bin/python}")"

# Command prefix for the opf CLI, kept as an array so a path containing
# spaces stays a single word.
OPF=("$PYTHON" -m opf)

# Training settings. A non-empty value from the environment takes precedence
# over the default given here.
: "${PATIENCE:=3}"
: "${BATCH_SIZE:=4}"
: "${GRAD_ACCUM_STEPS:=2}"
: "${EPOCHS:=15}"
: "${LEARNING_RATE:=5e-5}"
: "${O_DOWNSAMPLE:=0.3}"
: "${DEVICE:=cuda}"

# Where the training log and the checkpoints are written.
: "${TRAIN_LOG:=train_r9.log}"
: "${CKPT_DIR:=checkpoints/r9_5class}"

# Audit and evaluation outputs are written under $RESULTS.
mkdir -p "$RESULTS"

# Pre-flight: run scripts/build_r9_dataset.py, then
# scripts/audit_r9_readiness.py. The output of each step is shown on the
# terminal and copied to a log file in the working directory.
printf '%s\n' \
  "================================================================" \
  "  R9 PRE-FLIGHT" \
  "================================================================"

build_cmd=("$PYTHON" scripts/build_r9_dataset.py)
audit_cmd=(
  "$PYTHON" scripts/audit_r9_readiness.py
  --json-out "$RESULTS/r9_readiness_audit.json"
  --md-out "$RESULTS/r9_readiness_audit.md"
)

"${build_cmd[@]}" 2>&1 | tee build_r9.log
"${audit_cmd[@]}" 2>&1 | tee audit_r9_readiness.log

# ---------------------------------------------------------------------------
# Training: print the effective configuration, start the trainer in the
# background, and run the early-stop monitor next to it.
# ---------------------------------------------------------------------------
echo "================================================================"
echo "  R9: strict merged 5-class training"
echo "================================================================"
echo "Start time:        $(date)"
echo "Train data:        $DATA/r9_5class_train.jsonl"
echo "Validation data:   $DATA/r9_5class_valid.jsonl"
echo "Checkpoint dir:    $CKPT_DIR"
echo "Batch/accum:       $BATCH_SIZE / $GRAD_ACCUM_STEPS"
echo "Epochs/patience:   $EPOCHS / $PATIENCE"
echo "LR:                $LEARNING_RATE"
echo "O-downsample:      $O_DOWNSAMPLE"
echo "Device:            $DEVICE"
echo "================================================================"

# Output is mirrored to $TRAIN_LOG through process substitution rather than
# `train ... | tee ... &`. After a backgrounded pipeline, $! holds the PID of
# the LAST stage (tee), so the monitor would have been given tee's PID
# instead of the trainer's.
# NOTE: tee is started by the trainer's own process, not by this shell, so it
# is not awaited; it exits once it has drained the trainer's remaining output.
"${OPF[@]}" train "$DATA/r9_5class_train.jsonl" \
  --validation-dataset "$DATA/r9_5class_valid.jsonl" \
  --label-space-json "$LABELS/cyner_5class.json" \
  --output-dir "$CKPT_DIR" \
  --overwrite-output \
  --epochs "$EPOCHS" --batch-size "$BATCH_SIZE" --grad-accum-steps "$GRAD_ACCUM_STEPS" \
  --learning-rate "$LEARNING_RATE" \
  --warmup-fraction 0.1 --lr-schedule cosine \
  --loss-fn focal --focal-gamma 2.0 \
  --llrd-factor 0.9 \
  --o-downsample "$O_DOWNSAMPLE" \
  --device "$DEVICE" > >(tee "$TRAIN_LOG") 2>&1 &
TRAIN_PID=$!

# The monitor is given the log path, the patience value and the trainer PID.
# NOTE(review): presumably it stops the trainer once validation stops
# improving -- confirm against scripts/early_stop_monitor.sh.
bash scripts/early_stop_monitor.sh "$TRAIN_LOG" "$PATIENCE" "$TRAIN_PID" &
MONITOR_PID=$!

# A non-zero trainer status must not abort the run: checkpoint selection
# below has a dedicated path for training that was stopped early. The status
# is still recorded and reported rather than discarded.
TRAIN_STATUS=0
wait "$TRAIN_PID" || TRAIN_STATUS=$?

# Best effort: the monitor may already have exited on its own.
kill "$MONITOR_PID" 2>/dev/null || true

echo "Training finished: $(date)"
if [ "$TRAIN_STATUS" -ne 0 ]; then
  echo "warning: trainer exited with status $TRAIN_STATUS" >&2
fi

#######################################
# Report the epoch with the lowest val_loss recorded in a training log.
# Only lines starting with "epoch" are inspected. Each line is split on
# space, ':', '/' and '='; the epoch is the second field and the loss is the
# field that follows a "val_loss" field. Values that do not start with a
# number (empty, "nan", ...) are ignored. On a tie the earliest line wins.
# Arguments: $1 - path to the training log
# Outputs:   the best epoch on stdout, or nothing when no usable line exists
# Returns:   0, including when the log is missing, unreadable or has no
#            epoch lines, so callers running under errexit/pipefail continue
#            with an empty result
#######################################
best_epoch_from_log() {
  local log_file=$1

  [ -r "$log_file" ] || return 0

  # One awk pass replaces grep | awk | sort -n | head | awk: with pipefail a
  # grep that matched nothing made the whole assignment fail, and sort -n
  # does not understand exponent notation such as 9e-3.
  awk -F'[ :/=]' '
    /^epoch/ {
      for (i = 1; i < NF; i++) {
        if ($i != "val_loss") continue
        value = $(i + 1)
        if (value !~ /^[-+]?([0-9]|[.][0-9])/) continue
        if (!found || value + 0 < best) {
          best = value + 0
          best_epoch = $2
          found = 1
        }
      }
    }
    END { if (found) print best_epoch }
  ' "$log_file"
}

BEST_EPOCH="$(best_epoch_from_log "$TRAIN_LOG")"
echo "Best epoch by val_loss: $BEST_EPOCH"

# Choose the checkpoint to evaluate:
#   - default: the output directory itself ($CKPT_DIR);
#   - a best epoch is known and $CKPT_DIR has no model.safetensors: treat
#     training as stopped early and use that epoch's directory;
#   - a best epoch is known and its directory exists: prefer it over the
#     final weights.
CKPT="$CKPT_DIR"
best_epoch_ckpt="${CKPT_DIR}/epoch_${BEST_EPOCH}"

if [ -n "$BEST_EPOCH" ]; then
  if [ ! -f "$CKPT_DIR/model.safetensors" ]; then
    echo "Training killed early - using epoch $BEST_EPOCH checkpoint"
    CKPT="$best_epoch_ckpt"
  elif [ -d "$best_epoch_ckpt" ]; then
    echo "Using best-epoch checkpoint (epoch $BEST_EPOCH) over final"
    CKPT="$best_epoch_ckpt"
  fi
fi

# Stop here with a clear message instead of letting the first evaluation
# fail on a path that was never written.
if [ ! -e "$CKPT" ]; then
  echo "error: selected checkpoint does not exist: $CKPT" >&2
  exit 1
fi
echo "Selected checkpoint: $CKPT"

# Flags shared by every `opf eval` call. They are held in an array, not a
# flat string, so a checkpoint path or device value containing spaces or glob
# characters is passed through as a single argument.
EVAL_FLAGS=(
  --checkpoint "$CKPT"
  --decode-mode viterbi
  --per-class
  --label-counts
  --device "$DEVICE"
)

echo ""
echo "================================================================"
echo "  EVALUATION PHASE"
echo "================================================================"

#######################################
# Evaluate the selected checkpoint on one test set.
# Globals:   OPF, DATA, EVAL_FLAGS (read)
# Arguments: $1 - label shown in the section banner
#            $2 - dataset file name under $DATA
#            $3 - log file that receives a copy of the output
# Outputs:   banner and evaluation output on stdout
# Returns:   status of the eval pipeline; under errexit/pipefail a failing
#            evaluation aborts the script
#######################################
run_opf_eval() {
  local title=$1
  local dataset=$2
  local log_file=$3

  echo ""
  echo "===== Eval R9: $title ====="
  "${OPF[@]}" eval "$DATA/$dataset" \
    "${EVAL_FLAGS[@]}" 2>&1 | tee "$log_file"
}

run_opf_eval "Enriched 5-class test" enriched_5class_test.jsonl eval_r9_enriched.log
run_opf_eval "CyNER test" cyner_test.jsonl eval_r9_cyner.log
run_opf_eval "SecureBERT2 5-class test" securebert2_5class_test.jsonl eval_r9_sb2.log
run_opf_eval "APTNER 5-class independent test" aptner_5class_test_clean.jsonl eval_r9_aptner.log

# Run scripts/viterbi_grid_search.py for the selected checkpoint on the R9
# validation file; the script is asked to write $RESULTS/viterbi_r9_best.json.
printf '\n%s\n' "===== Viterbi Grid Search (R9 validation) ====="

viterbi_args=(
  --checkpoint "$CKPT"
  --val-data "$DATA/r9_5class_valid.jsonl"
  --output "$RESULTS/viterbi_r9_best.json"
  --device "$DEVICE"
)
"$PYTHON" scripts/viterbi_grid_search.py "${viterbi_args[@]}" 2>&1 | tee viterbi_r9.log

#######################################
# Run scripts/eval_exact_match.py for the selected checkpoint on one test set.
# Globals:   PYTHON, CKPT, DATA, DEVICE, RESULTS (read)
# Arguments: $1 - short tag used in the JSON and log file names
#            $2 - label shown in the section banner
#            $3 - dataset file name under $DATA
# Outputs:   banner and script output on stdout, copied to exact_r9_<tag>.log
#######################################
exact_match_eval() {
  local tag=$1
  local heading=$2
  local test_file=$3

  printf '\n%s\n' "===== Exact-Match Eval R9: $heading ====="
  "$PYTHON" scripts/eval_exact_match.py \
    --checkpoint "$CKPT" \
    --test-data "$DATA/$test_file" \
    --device "$DEVICE" \
    --decode-mode viterbi \
    --json-out "$RESULTS/r9_${tag}_exact_match.json" \
    2>&1 | tee "exact_r9_${tag}.log"
}

exact_match_eval cyner "CyNER test" cyner_test.jsonl
exact_match_eval aptner "APTNER independent test" aptner_5class_test_clean.jsonl

# Closing summary: where the artefacts are, followed by a short excerpt of
# two of the evaluation logs.
printf '%s\n' \
  "" \
  "================================================================" \
  "  R9 COMPLETE - $(date)" \
  "================================================================" \
  "Checkpoint:  $CKPT" \
  "Logs:        $TRAIN_LOG, eval_r9_{enriched,cyner,sb2,aptner}.log, exact_r9_{cyner,aptner}.log, viterbi_r9.log"

#######################################
# Print up to 20 lines of an evaluation log that mention micro/macro or start
# with two spaces. A missing log or a log with no matching line is not an
# error.
# Arguments: $1 - heading text
#            $2 - evaluation log to excerpt
#######################################
print_metric_excerpt() {
  local heading=$1
  local log_file=$2

  echo ""
  echo "--- $heading ---"
  grep -E '(micro|macro|^  )' "$log_file" 2>/dev/null | head -20 || true
}

print_metric_excerpt "Enriched" eval_r9_enriched.log
print_metric_excerpt "APTNER independent" eval_r9_aptner.log
echo "================================================================"