arcspan / scripts /run_train_v8.sh

Add files using upload-large-folder tool

3dac39e verified 8 days ago

5.98 kB

	#!/bin/bash
	# =============================================================================
	# Round 8: FINAL training run on DELEAKED 5-class cybersecurity NER data
	#
	# Key differences from R6:
	# - Data: r8_5class_{train,valid}.jsonl — fully deleaked (no train/test overlap)
	# - O-token downsampling: --o-downsample 0.7 (mask 70% of O-tokens from loss)
	# This should boost entity recall given ~85-90% O-token prevalence
	# - Four evaluation test sets (incl. APTNER zero-leakage independent benchmark)
	# - Same proven hyperparams: focal(γ=2), cosine LR, LLRD 0.9, LR 5e-5
	#
	# Expect slightly worse val_loss vs R6 (honest eval on clean data).
	# GPU: RTX PRO 6000 96GB VRAM
	# =============================================================================
	# Strict mode: stop on a failing command, on use of an unset variable, and on
	# a failure in any stage of a pipeline.
	set -o errexit -o nounset -o pipefail
	# Every relative path used below is resolved from this directory.
	cd ~/alkyline

	# Allocator setting exported to the child processes this script starts.
	export PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True"

	# ---------------------------------------------------------------------------
	# Run configuration
	# ---------------------------------------------------------------------------
	# Where the trainer writes its output (final model and epoch_N/ directories).
	CKPT_DIR="checkpoints/r8_5class"
	# Training output is mirrored here; the early-stop monitor and the best-epoch
	# selection both read it.
	TRAIN_LOG="train_r8.log"
	# Directory holding the train / validation / test JSONL files.
	DATA="data/processed"
	# Directory holding the label-space JSON definitions.
	LABELS="data/label_spaces"
	# Passed to the early-stop monitor: epochs without val_loss improvement
	# before training is stopped.
	PATIENCE="3"

	# ---------------------------------------------------------------------------
	# Training
	#
	# Starts `opf train` in the background with stdout+stderr mirrored to
	# $TRAIN_LOG, starts the early-stop monitor against the trainer's PID, then
	# blocks until the trainer exits (on its own or stopped by the monitor).
	# ---------------------------------------------------------------------------
	echo "===== R8: Deleaked 5-class (focal + cosine + LLRD + O-downsample) ====="
	echo "Start time: $(date)"

	# Output is mirrored through a process substitution instead of `| tee ... &`:
	# for a backgrounded pipeline `$!` is the PID of the last stage (tee), so the
	# monitor would signal tee and `wait` would return while the trainer was still
	# running. With the redirection form `$!` is the trainer itself.
	# NOTE: tee runs asynchronously and may still be flushing the final lines for
	# a brief moment after `wait` returns.
	opf train "$DATA/r8_5class_train.jsonl" \
	  --validation-dataset "$DATA/r8_5class_valid.jsonl" \
	  --label-space-json "$LABELS/cyner_5class.json" \
	  --output-dir "$CKPT_DIR" \
	  --overwrite-output \
	  --epochs 15 --batch-size 4 --grad-accum-steps 2 \
	  --learning-rate 5e-5 \
	  --warmup-fraction 0.1 --lr-schedule cosine \
	  --loss-fn focal --focal-gamma 2.0 \
	  --llrd-factor 0.9 \
	  --o-downsample 0.7 \
	  --device cuda > >(tee "$TRAIN_LOG") 2>&1 &
	TRAIN_PID=$!

	# Early stopping monitor — kills training if val_loss hasn't improved for $PATIENCE epochs
	bash scripts/early_stop_monitor.sh "$TRAIN_LOG" "$PATIENCE" "$TRAIN_PID" &
	MONITOR_PID=$!

	# A non-zero trainer status is expected when the monitor stops training early,
	# so it is reported on stderr rather than treated as fatal.
	train_status=0
	wait "$TRAIN_PID" 2>/dev/null || train_status=$?
	# The monitor may already have exited; ignoring a failed kill is intentional.
	kill "$MONITOR_PID" 2>/dev/null || true
	if [ "$train_status" -ne 0 ]; then
	  echo "Trainer exited with status $train_status" >&2
	fi

	echo "Training finished: $(date)"

	# ---------------------------------------------------------------------------
	# Best-epoch checkpoint selection
	#
	# The trainer saves per-epoch checkpoints as epoch_N/ and a final model.
	# If early stopping killed training, the final model.safetensors may not
	# exist. In that case, find the epoch with the lowest val_loss and use it.
	# ---------------------------------------------------------------------------

	#######################################
	# Print the epoch whose val_loss is lowest in a training log.
	# Only lines starting with "epoch" are considered; each is split on space,
	# ':', '/' and '=', the field after "val_loss" is taken as the loss and the
	# second field as the epoch number (assumes lines shaped like
	# "epoch 3/15 ... val_loss=0.41" — confirm against the trainer's log format).
	# Arguments: $1 - path to the training log
	# Outputs:   epoch number on stdout; nothing when the log is unreadable or
	#            contains no matching line
	# Returns:   0 in both of those cases, so callers running under
	#            `set -e -o pipefail` can fall back instead of aborting
	#######################################
	best_epoch_from_log() {
	  local log_file=$1
	  [ -r "$log_file" ] || return 0
	  # grep exits 1 when nothing matches; without the guard, pipefail + errexit
	  # would abort the whole script at the assignment below.
	  # The final awk reads all of sort's output, so sort never sees SIGPIPE.
	  { grep '^epoch' "$log_file" || true; } \
	    | awk -F'[ :/=]' '{for(i=1;i<=NF;i++){if($i=="val_loss")print $(i+1)" "$2}}' \
	    | LC_ALL=C sort -n \
	    | awk 'NR==1{print $2}'
	}

	BEST_EPOCH=$(best_epoch_from_log "$TRAIN_LOG")
	echo "Best epoch by val_loss: $BEST_EPOCH"

	CKPT="$CKPT_DIR"
	if [ ! -f "$CKPT/model.safetensors" ] && [ -n "$BEST_EPOCH" ]; then
	  echo "Training killed early — using epoch $BEST_EPOCH checkpoint"
	  CKPT="${CKPT_DIR}/epoch_${BEST_EPOCH}"
	elif [ -n "$BEST_EPOCH" ] && [ -d "${CKPT_DIR}/epoch_${BEST_EPOCH}" ]; then
	  # Even if training ran to completion, prefer the best-epoch checkpoint
	  echo "Using best-epoch checkpoint (epoch $BEST_EPOCH) over final"
	  CKPT="${CKPT_DIR}/epoch_${BEST_EPOCH}"
	fi
	# Surface a bad selection here instead of deep inside the first eval call.
	if [ ! -d "$CKPT" ]; then
	  echo "WARNING: selected checkpoint directory does not exist: $CKPT" >&2
	fi
	echo "Selected checkpoint: $CKPT"

	# ---------------------------------------------------------------------------
	# Evaluation — four test sets
	# ---------------------------------------------------------------------------
	# Flags shared by every `opf eval` call. Held in an array so that each flag,
	# and the checkpoint path in particular, reaches opf as exactly one argument.
	EVAL_FLAGS=(--checkpoint "$CKPT" --decode-mode viterbi --per-class --label-counts --device cuda)

	#######################################
	# Evaluate one dataset and mirror the combined stdout/stderr to a log file.
	# Globals:   EVAL_FLAGS (read)
	# Arguments: $1 - banner line printed before the run
	#            $2 - path to the test JSONL
	#            $3 - log file to write
	# Returns:   status of the eval pipeline (non-zero aborts the script under
	#            `set -e -o pipefail`)
	#######################################
	run_eval() {
	  local banner=$1 dataset=$2 log_file=$3
	  echo ""
	  echo "$banner"
	  opf eval "$dataset" "${EVAL_FLAGS[@]}" 2>&1 | tee "$log_file"
	}

	echo ""
	echo "================================================================"
	echo " EVALUATION PHASE"
	echo "================================================================"

	# 1) Primary benchmark: enriched 5-class test
	run_eval "===== Eval R8: Enriched 5-class test (primary benchmark) =====" \
	  "$DATA/enriched_5class_test.jsonl" eval_r8_enriched.log

	# 2) CyNER test (supplementary — 98% overlap with enriched)
	run_eval "===== Eval R8: CyNER test (supplementary, ~98% overlap) =====" \
	  "$DATA/cyner_test.jsonl" eval_r8_cyner.log

	# 3) SecureBERT2 test (supplementary — 96% overlap)
	run_eval "===== Eval R8: SecureBERT2 5-class test (supplementary, ~96% overlap) =====" \
	  "$DATA/securebert2_5class_test.jsonl" eval_r8_sb2.log

	# 4) APTNER — independent benchmark (ZERO leakage, most important)
	run_eval "===== Eval R8: APTNER 5-class test (INDEPENDENT — zero leakage) =====" \
	  "$DATA/aptner_5class_test_clean.jsonl" eval_r8_aptner.log

	# ---------------------------------------------------------------------------
	# Viterbi grid search on validation set
	# Find optimal Viterbi transition penalties for inference
	# ---------------------------------------------------------------------------
	echo ""
	echo "===== Viterbi Grid Search (on validation set) ====="
	# The grid-search script is not part of this file, so create the output
	# directory here rather than rely on the script doing it.
	mkdir -p results
	# stdout and stderr are both mirrored to viterbi_r8.log through a real pipe.
	python3 scripts/viterbi_grid_search.py \
	  --checkpoint "$CKPT" \
	  --val-data "$DATA/r8_5class_valid.jsonl" \
	  --output results/viterbi_r8_best.json \
	  --device cuda 2>&1 | tee viterbi_r8.log

	# ---------------------------------------------------------------------------
	# Summary
	# ---------------------------------------------------------------------------

	#######################################
	# Print up to 20 lines of an eval log that mention "micro" or "macro" or
	# that start with a space.
	# Arguments: $1 - path to an eval log
	# Outputs:   matching lines on stdout; nothing if the log is missing or has
	#            no matching line
	# Returns:   always 0 — the summary is best-effort and must not abort the run
	#######################################
	show_key_results() {
	  grep -E '(micro|macro|^ )' "$1" 2>/dev/null | head -n 20 || true
	}

	echo ""
	echo "================================================================"
	echo " R8 COMPLETE — $(date)"
	echo "================================================================"
	echo "Checkpoint: $CKPT"
	echo "Logs: $TRAIN_LOG, eval_r8_{enriched,cyner,sb2,aptner}.log, viterbi_r8.log"
	echo ""
	echo "Key results:"
	echo "--- Enriched (primary) ---"
	show_key_results eval_r8_enriched.log
	echo ""
	echo "--- APTNER (independent) ---"
	show_key_results eval_r8_aptner.log
	echo "================================================================"