frankenstallm / source /scripts /apply_optimizations.sh
pathcosmos's picture
Upload folder using huggingface_hub (#17)
48ecd01
raw
history blame
6.49 kB
#!/usr/bin/env bash
# =============================================================================
# apply_optimizations.sh — Apply v2 optimizations and restart training
#
# Optimizations applied:
# 1. QKV Projection Fusion (+8-12% throughput)
# 2. NUMA CPU Affinity (fix 69% cross-NUMA workers)
# 3. Batch size 4→5 (11h saved over full run)
# 4. NCCL NVLS algorithm + 256MB buffers
# 5. DDP bucket_cap_mb 400→800
# 6. DataLoader num_workers 4→6, prefetch_factor 3→4
# 7. MADV_RANDOM + WILLNEED for PackedDataset
# 8. numactl --interleave=all on torchrun
#
# Usage:
# bash scripts/apply_optimizations.sh # full migration
# bash scripts/apply_optimizations.sh --test-only # just validate, don't restart
# bash scripts/apply_optimizations.sh --skip-stop # don't stop current training
# =============================================================================
# ---- Setup: shell options, paths, and command-line flags -----------------
# -u: expanding an unset variable is an error instead of an empty string.
set -u

# Run from the repository root (the parent of this script's directory) so the
# relative paths used by every later step resolve the same way regardless of
# the caller's working directory. Abort if that fails: continuing elsewhere
# would validate, stop and migrate against the wrong tree.
cd -- "$(dirname -- "$0")/.." || {
  echo "[ERROR] Cannot change to the repository root." >&2
  exit 1
}

RUN_NAME="korean_3b_fp8_run1"
CKPT_DIR="checkpoints/${RUN_NAME}"
PID_FILE="${CKPT_DIR}/train.pid"
LOG_FILE="${CKPT_DIR}/train.log"

TEST_ONLY=false   # set by --test-only: validate files, then exit
SKIP_STOP=false   # set by --skip-stop: leave the current training running

for arg in "$@"; do
  case "$arg" in
    --test-only) TEST_ONLY=true ;;
    --skip-stop) SKIP_STOP=true ;;
    *)
      # A mistyped flag (e.g. --testonly) must not fall through to the full
      # migration, which stops and restarts a live training run.
      echo "[ERROR] Unknown argument: $arg" >&2
      echo "Usage: bash scripts/apply_optimizations.sh [--test-only] [--skip-stop]" >&2
      exit 2
      ;;
  esac
done

echo "=================================================================="
echo " FRANKENSTALLM 3B — Optimization Migration v2"
echo " $(date)"
echo "=================================================================="
# ---- Step 1: Validate all modified files --------------------------------
# Syntax-checks the Python sources and the launch script, and confirms the
# YAML config parses and carries train.batch_size == 5. Nothing is stopped or
# modified in this step; with --test-only the script exits right after it.
echo ""
echo "[1/6] Validating modified files..."
ERRORS=0

PY_FILES=(
  model/attention.py
  train/pretrain.py
  data/dataset.py
  scripts/migrate_qkv_checkpoint.py
)
for pyfile in "${PY_FILES[@]}"; do
  # The path is passed as argv rather than spliced into the Python source, so
  # quotes or backslashes in a name cannot change the code being executed.
  if python3 -c 'import ast, sys; ast.parse(open(sys.argv[1]).read())' "$pyfile" 2>/dev/null; then
    echo " ✓ $pyfile — syntax OK"
  else
    echo " ✗ $pyfile — SYNTAX ERROR"
    # Plain assignment: ((ERRORS++)) returns status 1 when ERRORS is 0.
    ERRORS=$((ERRORS + 1))
  fi
done

if bash -n scripts/launch_3b_pretrain.sh 2>/dev/null; then
  echo " ✓ scripts/launch_3b_pretrain.sh — syntax OK"
else
  echo " ✗ scripts/launch_3b_pretrain.sh — SYNTAX ERROR"
  ERRORS=$((ERRORS + 1))
fi

# Check YAML.
# The quoted here-doc keeps the Python text (including its indentation)
# verbatim. An explicit comparison is used instead of `assert`, which is
# skipped when Python runs with optimizations enabled. On failure Python
# prints a one-line reason to stderr.
if python3 - <<'PY'
import sys

try:
    import yaml
    with open('configs/korean_3b_fp8.yaml') as f:
        cfg = yaml.safe_load(f)
    batch_size = cfg['train']['batch_size']
except Exception as exc:  # missing file or module, bad YAML, missing keys
    sys.exit(f'   {type(exc).__name__}: {exc}')
if batch_size != 5:
    sys.exit(f'   batch_size should be 5, got {batch_size}')
PY
then
  echo " ✓ configs/korean_3b_fp8.yaml — valid, batch_size=5"
else
  echo " ✗ configs/korean_3b_fp8.yaml — INVALID"
  ERRORS=$((ERRORS + 1))
fi

if (( ERRORS > 0 )); then
  echo ""
  echo "[ERROR] $ERRORS file(s) failed validation. Aborting."
  exit 1
fi
echo " All files validated successfully."

if [[ "$TEST_ONLY" == true ]]; then
  echo ""
  echo "[INFO] --test-only mode. Exiting without restart."
  exit 0
fi
# ---- Step 2: Stop current training (graceful) ---------------------------
# Sends SIGTERM to the PID recorded in $PID_FILE, waits up to 120s for that
# process to exit and escalates to SIGKILL if it does not, then waits up to
# 30s for processes matching "pretrain.py.*korean_3b" to disappear.
# The whole step is skipped with --skip-stop.

# is_valid_pid VALUE
# Succeeds only when VALUE is a positive decimal integer. kill treats 0 and
# negative numbers as process-group selectors, so such values read from a PID
# file must never be passed on.
is_valid_pid() {
  [[ "${1:-}" =~ ^[1-9][0-9]*$ ]]
}

if ! $SKIP_STOP; then
  echo ""
  echo "[2/6] Stopping current training (SIGTERM → emergency checkpoint)..."
  if [[ -f "$PID_FILE" ]]; then
    # First line only; `read` returns non-zero when the trailing newline is
    # missing but still stores the value, hence the `|| true`.
    PID=""
    read -r PID < "$PID_FILE" || true
    if ! is_valid_pid "$PID"; then
      echo " [WARN] $PID_FILE does not contain a valid PID; no signal sent."
    elif kill -0 "$PID" 2>/dev/null; then
      echo " Sending SIGTERM to PID $PID..."
      kill "$PID"
      echo " Waiting for graceful shutdown (up to 120s)..."
      for ((i = 1; i <= 120; i++)); do
        if ! kill -0 "$PID" 2>/dev/null; then
          echo " Process stopped after ${i}s"
          break
        fi
        sleep 1
      done
      if kill -0 "$PID" 2>/dev/null; then
        echo " [WARN] Process still running after 120s. Force killing..."
        kill -9 "$PID" 2>/dev/null || true
        sleep 2
      fi
    else
      echo " Process $PID not running."
    fi
  else
    echo " No PID file found."
  fi

  # Wait for all GPU processes to clear
  echo " Waiting for GPU processes to terminate..."
  GPU_CLEARED=false
  for ((i = 1; i <= 30; i++)); do
    if ! pgrep -f "pretrain.py.*korean_3b" >/dev/null 2>&1; then
      echo " All GPU processes cleared."
      GPU_CLEARED=true
      break
    fi
    sleep 1
  done
  # Make the timeout visible instead of falling through silently.
  if [[ "$GPU_CLEARED" != true ]]; then
    echo " [WARN] pretrain.py processes still present after 30s; continuing anyway." >&2
  fi
fi
# ---- Step 3: Find and migrate latest checkpoint -------------------------
# Selects the highest-versioned ${CKPT_DIR}/checkpoint-* entry, backs up its
# model.pt as model.pt.backup_pre_qkv, and runs migrate_qkv_checkpoint.py on
# the checkpoint. Every failure aborts before anything is reported as done.
echo ""
echo "[3/6] Migrating latest checkpoint (QKV fusion)..."

# Expand the glob in the shell instead of parsing `ls` output. nullglob makes
# a non-matching pattern expand to nothing rather than to itself.
shopt -s nullglob
CKPT_CANDIDATES=("${CKPT_DIR}"/checkpoint-*)
shopt -u nullglob

LATEST_CKPT=""
if (( ${#CKPT_CANDIDATES[@]} > 0 )); then
  # Version sort so checkpoint-900 orders before checkpoint-1000.
  LATEST_CKPT=$(printf '%s\n' "${CKPT_CANDIDATES[@]}" | sort -V | tail -n 1)
fi
if [[ -z "$LATEST_CKPT" ]]; then
  echo " [ERROR] No checkpoint found!"
  exit 1
fi
echo " Latest checkpoint: $LATEST_CKPT"

MODEL_PT="${LATEST_CKPT}/model.pt"
MODEL_BACKUP="${LATEST_CKPT}/model.pt.backup_pre_qkv"
if [[ ! -f "$MODEL_PT" ]]; then
  echo " [ERROR] ${MODEL_PT} not found!"
  exit 1
fi

# Backup original model.pt.
# An existing backup is kept: on a re-run, model.pt may already have been
# rewritten by a previous migration attempt, and copying it over the backup
# would destroy the only pre-migration copy.
if [[ -e "$MODEL_BACKUP" ]]; then
  echo " Backup already exists, keeping it: model.pt.backup_pre_qkv"
elif cp -- "$MODEL_PT" "$MODEL_BACKUP"; then
  echo " Backup created: model.pt.backup_pre_qkv"
else
  # Remove a partial copy so a later run does not mistake it for a backup.
  rm -f -- "$MODEL_BACKUP"
  echo " [ERROR] Failed to back up ${MODEL_PT}. Aborting."
  exit 1
fi

# Run migration.
# NOTE(review): presumably the script rewrites model.pt in place (hence the
# backup); restoring from the backup on failure is safe either way.
if ! python3 scripts/migrate_qkv_checkpoint.py "$LATEST_CKPT"; then
  echo " [ERROR] QKV fusion migration failed. Restoring original model.pt..."
  if ! cp -- "$MODEL_BACKUP" "$MODEL_PT"; then
    echo " [ERROR] Restore failed; the original is still at ${MODEL_BACKUP}"
  fi
  exit 1
fi
echo " QKV fusion migration complete."
# ---- Step 4: Quick validation test (5 steps) ----------------------------
# Runs train/pretrain.py for 5 steps, resuming from the migrated checkpoint,
# before the real launch. If the run fails or times out, the pre-migration
# model.pt is copied back from model.pt.backup_pre_qkv and the script aborts.
echo ""
echo "[4/6] Running 5-step validation test..."

# restore_pre_qkv_backup
# Copies model.pt.backup_pre_qkv back over model.pt inside $LATEST_CKPT and
# reports whether the copy succeeded.
restore_pre_qkv_backup() {
  echo " [WARN] Restoring original checkpoint..."
  if cp -- "${LATEST_CKPT}/model.pt.backup_pre_qkv" "${LATEST_CKPT}/model.pt"; then
    echo " Original checkpoint restored. Aborting."
  else
    echo " [ERROR] Restore failed; the original is still at ${LATEST_CKPT}/model.pt.backup_pre_qkv"
  fi
}

# Private scratch directory instead of a fixed, predictable path in /tmp.
if ! TEST_CKPT_DIR=$(mktemp -d "${TMPDIR:-/tmp}/frankenstallm_test.XXXXXX"); then
  echo " [ERROR] Could not create a temporary checkpoint directory."
  restore_pre_qkv_backup
  exit 1
fi

# Use single GPU for fast test (plain python3, no torchrun).
timeout 120 python3 train/pretrain.py \
  --config configs/korean_3b_fp8.yaml \
  --train_data data/3b_train.bin \
  --checkpoint_dir "$TEST_CKPT_DIR" \
  --max_steps 5 \
  --batch_size 5 \
  --resume "$LATEST_CKPT" \
  2>&1 | tail -n 10
# $? after a pipeline is the status of its LAST stage (tail), which succeeds
# even when the trainer crashes. PIPESTATUS[0] is the status of `timeout`,
# i.e. the trainer's own status, or 124 if the 120s limit was hit.
TEST_EXIT=${PIPESTATUS[0]}

if [[ $TEST_EXIT -ne 0 ]]; then
  echo " ✗ 5-step test FAILED (exit code $TEST_EXIT)"
  if [[ $TEST_EXIT -eq 124 ]]; then
    echo " (exit code 124: the test did not finish within 120s)"
  fi
  # Do not leave test artifacts behind on the failure path either.
  rm -rf -- "${TEST_CKPT_DIR:?}"
  restore_pre_qkv_backup
  exit 1
fi
echo " ✓ 5-step test passed!"

# ---- Step 5: Clean up test artifacts ------------------------------------
echo ""
echo "[5/6] Cleaning up test artifacts..."
# :? aborts instead of expanding to an empty path if the variable is lost.
rm -rf -- "${TEST_CKPT_DIR:?}"
# ---- Step 6: Launch full training with optimizations --------------------
# Prints a summary of the changes, then hands over to the launch script. The
# closing banner is printed only when the launch script exits successfully.
echo ""
echo "[6/6] Launching optimized training..."
echo ""
# Quoted here-doc: the text is printed verbatim, with no expansion.
cat <<'EOF'
 Changes applied:
 • QKV Projection Fusion (single GEMM)
 • NUMA CPU Affinity (cores 0-35→GPU0-3, 36-71→GPU4-7)
 • Batch size: 4 → 5
 • NCCL: NVLS,Ring algorithm, 256MB buffers
 • DDP: bucket_cap_mb 400 → 800
 • DataLoader: 4→6 workers, prefetch 3→4
 • MADV_RANDOM + WILLNEED for dataset mmap
 • numactl --interleave=all on torchrun
EOF
echo ""

bash scripts/launch_3b_pretrain.sh
LAUNCH_EXIT=$?
if [[ $LAUNCH_EXIT -ne 0 ]]; then
  # Do not claim success when the launcher reported an error.
  echo ""
  echo "[ERROR] scripts/launch_3b_pretrain.sh failed (exit code $LAUNCH_EXIT)."
  echo "        Check ${LOG_FILE} for details."
  exit "$LAUNCH_EXIT"
fi

echo ""
echo "=================================================================="
echo " Migration complete! Monitor with:"
echo " tail -f ${LOG_FILE}"
echo "=================================================================="