#!/usr/bin/env bash
# =============================================================================
# apply_optimizations.sh — Apply v2 optimizations and restart training
#
# Optimizations applied:
#   1. QKV Projection Fusion (+8-12% throughput)
#   2. NUMA CPU Affinity (fix 69% cross-NUMA workers)
#   3. Batch size 4→5 (11h saved over full run)
#   4. NCCL NVLS algorithm + 256MB buffers
#   5. DDP bucket_cap_mb 400→800
#   6. DataLoader num_workers 4→6, prefetch_factor 3→4
#   7. MADV_RANDOM + WILLNEED for PackedDataset
#   8. numactl --interleave=all on torchrun
#
# Usage:
#   bash scripts/apply_optimizations.sh               # full migration
#   bash scripts/apply_optimizations.sh --test-only   # just validate, don't restart
#   bash scripts/apply_optimizations.sh --skip-stop   # don't stop current training
# =============================================================================
# Fail on unset variables; pipefail so a failing producer in a pipeline
# (e.g. the Step-4 validation test piped through `tail`) is not masked by
# the consumer's exit status.  Deliberately NOT `set -e`: each critical
# step below checks its own status and prints a targeted message.
set -u -o pipefail

cd "$(dirname "$0")/.."

# Constants for this run; never reassigned below.
readonly RUN_NAME="korean_3b_fp8_run1"
readonly CKPT_DIR="checkpoints/${RUN_NAME}"
readonly PID_FILE="${CKPT_DIR}/train.pid"
readonly LOG_FILE="${CKPT_DIR}/train.log"

TEST_ONLY=false
SKIP_STOP=false
for arg in "$@"; do
  case "$arg" in
    --test-only) TEST_ONLY=true ;;
    --skip-stop) SKIP_STOP=true ;;
    # Warn on typos instead of silently ignoring them.
    *) echo "[WARN] Unknown argument: $arg (ignored)" >&2 ;;
  esac
done

echo "=================================================================="
echo " FRANKENSTALLM 3B — Optimization Migration v2"
echo " $(date)"
echo "=================================================================="
# ---- Step 1: Validate all modified files --------------------------------
# Syntax-check every file touched by the optimization work before we dare
# stop a multi-day training run.  Any failure aborts before Step 2.
echo ""
echo "[1/6] Validating modified files..."
ERRORS=0

for pyfile in model/attention.py train/pretrain.py data/dataset.py scripts/migrate_qkv_checkpoint.py; do
  if python3 -c "import ast; ast.parse(open('$pyfile').read())" 2>/dev/null; then
    echo "  ✓ $pyfile — syntax OK"
  else
    echo "  ✗ $pyfile — SYNTAX ERROR"
    # Plain arithmetic assignment: ((ERRORS++)) exits non-zero when the
    # pre-increment value is 0, which would misfire if `set -e` is ever
    # enabled and already trips ShellCheck SC2219.
    ERRORS=$((ERRORS + 1))
  fi
done

if bash -n scripts/launch_3b_pretrain.sh 2>/dev/null; then
  echo "  ✓ scripts/launch_3b_pretrain.sh — syntax OK"
else
  echo "  ✗ scripts/launch_3b_pretrain.sh — SYNTAX ERROR"
  ERRORS=$((ERRORS + 1))
fi

# Check YAML: config must parse and already carry the new batch size.
# NOTE: the `with` body must stay indented — this is inline Python source.
python3 -c "
import yaml
with open('configs/korean_3b_fp8.yaml') as f:
    cfg = yaml.safe_load(f)
assert cfg['train']['batch_size'] == 5, f'batch_size should be 5, got {cfg[\"train\"][\"batch_size\"]}'
print('  ✓ configs/korean_3b_fp8.yaml — valid, batch_size=5')
" 2>/dev/null || { echo "  ✗ configs/korean_3b_fp8.yaml — INVALID"; ERRORS=$((ERRORS + 1)); }

if [[ $ERRORS -gt 0 ]]; then
  echo ""
  echo "[ERROR] $ERRORS file(s) failed validation. Aborting."
  exit 1
fi
echo "  All files validated successfully."

if $TEST_ONLY; then
  echo ""
  echo "[INFO] --test-only mode. Exiting without restart."
  exit 0
fi
# ---- Step 2: Stop current training (graceful) ---------------------------
# SIGTERM first so the trainer can write its emergency checkpoint; SIGKILL
# only after the full graceful window has elapsed.
if ! $SKIP_STOP; then
  echo ""
  echo "[2/6] Stopping current training (SIGTERM → emergency checkpoint)..."
  if [[ -f "$PID_FILE" ]]; then
    PID=$(cat "$PID_FILE")
    if kill -0 "$PID" 2>/dev/null; then
      echo "  Sending SIGTERM to PID $PID..."
      kill "$PID"
      echo "  Waiting for graceful shutdown (up to 120s)..."
      for i in {1..120}; do
        if ! kill -0 "$PID" 2>/dev/null; then
          echo "  Process stopped after ${i}s"
          break
        fi
        sleep 1
      done
      # Escalate to SIGKILL only as a last resort.
      if kill -0 "$PID" 2>/dev/null; then
        echo "  [WARN] Process still running after 120s. Force killing..."
        kill -9 "$PID" 2>/dev/null || true
        sleep 2
      fi
    else
      echo "  Process $PID not running."
    fi
  else
    echo "  No PID file found."
  fi

  # Wait for all GPU worker processes (torchrun children hold GPU memory
  # even after the launcher dies) to clear before restarting.
  echo "  Waiting for GPU processes to terminate..."
  for i in {1..30}; do
    if ! pgrep -f "pretrain.py.*korean_3b" >/dev/null 2>&1; then
      echo "  All GPU processes cleared."
      break
    fi
    sleep 1
  done
  # Surface stragglers instead of silently proceeding into a CUDA OOM.
  if pgrep -f "pretrain.py.*korean_3b" >/dev/null 2>&1; then
    echo "  [WARN] GPU processes still present after 30s; proceeding anyway." >&2
  fi
fi
# ---- Step 3: Find and migrate latest checkpoint -------------------------
echo ""
echo "[3/6] Migrating latest checkpoint (QKV fusion)..."

# Newest checkpoint by version sort (checkpoint-2 < checkpoint-10);
# `find` instead of parsing `ls` output.
LATEST_CKPT=$(find "$CKPT_DIR" -maxdepth 1 -type d -name 'checkpoint-*' 2>/dev/null | sort -V | tail -1)
if [[ -z "$LATEST_CKPT" ]]; then
  echo "  [ERROR] No checkpoint found!"
  exit 1
fi
echo "  Latest checkpoint: $LATEST_CKPT"

# Backup original model.pt so a failed migration can be rolled back.
# BUG FIX: the backup and migration exit statuses were previously
# unchecked — a failed migration left a corrupt model.pt and the script
# marched on to relaunch training from it.
if ! cp "${LATEST_CKPT}/model.pt" "${LATEST_CKPT}/model.pt.backup_pre_qkv"; then
  echo "  [ERROR] Could not back up model.pt. Aborting before migration."
  exit 1
fi
echo "  Backup created: model.pt.backup_pre_qkv"

# Run migration; on failure restore the pristine weights and abort.
if ! python3 scripts/migrate_qkv_checkpoint.py "$LATEST_CKPT"; then
  echo "  [ERROR] Migration failed. Restoring original checkpoint..."
  cp "${LATEST_CKPT}/model.pt.backup_pre_qkv" "${LATEST_CKPT}/model.pt"
  exit 1
fi
echo "  QKV fusion migration complete."
# ---- Step 4: Quick validation test (5 steps) ----------------------------
echo ""
echo "[4/6] Running 5-step validation test..."
# Use single GPU for fast test
timeout 120 python3 train/pretrain.py \
  --config configs/korean_3b_fp8.yaml \
  --train_data data/3b_train.bin \
  --checkpoint_dir /tmp/frankenstallm_test \
  --max_steps 5 \
  --batch_size 5 \
  --resume "$LATEST_CKPT" \
  2>&1 | tail -10
# BUG FIX: `$?` after the pipeline reported `tail`'s status (almost always
# 0), so a failed training test could never be detected and the corrupted
# state would be launched anyway.  PIPESTATUS[0] is the status of the
# first stage (timeout/python); 124 means the 120s timeout fired.
TEST_EXIT=${PIPESTATUS[0]}

if [[ $TEST_EXIT -eq 0 ]]; then
  echo "  ✓ 5-step test passed!"
else
  echo "  ✗ 5-step test FAILED (exit code $TEST_EXIT)"
  echo "  [WARN] Restoring original checkpoint..."
  cp "${LATEST_CKPT}/model.pt.backup_pre_qkv" "${LATEST_CKPT}/model.pt"
  echo "  Original checkpoint restored. Aborting."
  exit 1
fi
# ---- Step 5: Clean up test artifacts ------------------------------------
echo ""
echo "[5/6] Cleaning up test artifacts..."
rm -rf /tmp/frankenstallm_test

# ---- Step 6: Launch full training with optimizations --------------------
echo ""
echo "[6/6] Launching optimized training..."
echo ""
echo "  Changes applied:"
echo "   • QKV Projection Fusion (single GEMM)"
echo "   • NUMA CPU Affinity (cores 0-35→GPU0-3, 36-71→GPU4-7)"
echo "   • Batch size: 4 → 5"
echo "   • NCCL: NVLS,Ring algorithm, 256MB buffers"
echo "   • DDP: bucket_cap_mb 400 → 800"
echo "   • DataLoader: 4→6 workers, prefetch 3→4"
echo "   • MADV_RANDOM + WILLNEED for dataset mmap"
echo "   • numactl --interleave=all on torchrun"
echo ""
# Don't print the success banner if the launcher itself failed.
if ! bash scripts/launch_3b_pretrain.sh; then
  echo "[ERROR] Launch script failed. Check ${LOG_FILE} for details." >&2
  exit 1
fi

echo ""
echo "=================================================================="
echo " Migration complete! Monitor with:"
echo "   tail -f ${LOG_FILE}"
echo "=================================================================="