#!/usr/bin/env bash
#
# Optimization migration v2 for the korean_3b_fp8_run1 training run.
#
# Steps, as announced by the script itself:
#   1/6  Validate the modified Python, shell and YAML files.
#   2/6  Stop the running training job (SIGTERM, SIGKILL after 120 s).
#   3/6  Back up and migrate the latest checkpoint (QKV fusion).
#   4/6  Run a 5-step training test against the migrated checkpoint.
#   5/6  Remove the test artifacts.
#   6/6  Relaunch training via scripts/launch_3b_pretrain.sh.
#
# Usage: <this script> [--test-only] [--skip-stop]
#   --test-only  Run step 1 only, then exit without touching the running job.
#   --skip-stop  Skip step 2 (do not stop the running job).
#
# Shell options:
#   -u           expanding an unset variable is an error
#   -o pipefail  a pipeline fails if any stage fails, not only the last one
# NOTE(review): -e is not enabled; steps are expected to inspect the exit
# codes they care about themselves.
set -uo pipefail

# Work from the parent of this script's directory: every path used below
# (scripts/..., configs/..., checkpoints/...) is relative to it. Abort if the
# directory change fails instead of operating on whatever the caller's
# working directory happens to be.
cd -- "$(dirname -- "$0")/.." || {
  echo "[ERROR] Cannot change directory to $(dirname -- "$0")/.." >&2
  exit 1
}

# Locations for this training run, relative to the directory entered above.
RUN_NAME="korean_3b_fp8_run1"
CKPT_DIR="checkpoints/${RUN_NAME}"
PID_FILE="${CKPT_DIR}/train.pid"  # read in step 2 to find the process to stop
LOG_FILE="${CKPT_DIR}/train.log"  # shown in the closing banner

#######################################
# Parse the command-line flags.
# Globals:   TEST_ONLY, SKIP_STOP (written; each is "true" or "false")
# Arguments: the script's command-line arguments
# Outputs:   an error and usage line on stderr for an unknown argument
# Returns:   exits the shell with status 2 on an unknown argument
#######################################
parse_args() {
  local arg
  TEST_ONLY=false
  SKIP_STOP=false
  for arg in "$@"; do
    case "$arg" in
      --test-only) TEST_ONLY=true ;;
      --skip-stop) SKIP_STOP=true ;;
      *)
        # Reject anything unrecognized: silently ignoring a mistyped flag
        # would fall through to the default path, which stops the running
        # training job.
        echo "[ERROR] Unknown argument: $arg" >&2
        echo "Usage: $0 [--test-only] [--skip-stop]" >&2
        exit 2
        ;;
    esac
  done
}

parse_args "$@"

# Start-of-run banner with the current date and time.
echo "=================================================================="
echo " FRANKENSTALLM 3B - Optimization Migration v2"
echo " $(date)"
echo "=================================================================="

# ---------------------------------------------------------------------------
# Step 1/6: validate every modified file before anything else is touched.
# ---------------------------------------------------------------------------
echo ""
echo "[1/6] Validating modified files..."
ERRORS=0

# Python sources: parse only, nothing is imported or executed. The path is
# handed over as an argument (sys.argv[1]) rather than spliced into the
# Python source text, so characters in a file name cannot change the program.
# stderr is discarded on purpose; the per-file status line is the report.
# Note that an unreadable or missing file is reported as SYNTAX ERROR too.
for pyfile in model/attention.py train/pretrain.py data/dataset.py scripts/migrate_qkv_checkpoint.py; do
  if python3 -c 'import ast, sys; ast.parse(open(sys.argv[1]).read())' "$pyfile" 2>/dev/null; then
    echo " [OK] $pyfile - syntax OK"
  else
    echo " [FAIL] $pyfile - SYNTAX ERROR"
    # Plain assignment instead of ((ERRORS++)): the post-increment form
    # returns status 1 whenever the previous value was 0.
    ERRORS=$((ERRORS + 1))
  fi
done

# Launch script: bash -n parses without executing.
if bash -n scripts/launch_3b_pretrain.sh 2>/dev/null; then
  echo " [OK] scripts/launch_3b_pretrain.sh - syntax OK"
else
  echo " [FAIL] scripts/launch_3b_pretrain.sh - SYNTAX ERROR"
  ERRORS=$((ERRORS + 1))
fi

# Training config: must load as YAML and carry train.batch_size == 5.
# Any failure (missing yaml module, missing file, wrong value) is reported
# as INVALID; stderr is discarded on purpose.
if ! python3 - 2>/dev/null <<'PY'
import yaml
with open('configs/korean_3b_fp8.yaml') as f:
    cfg = yaml.safe_load(f)
assert cfg['train']['batch_size'] == 5, f'batch_size should be 5, got {cfg["train"]["batch_size"]}'
print(' [OK] configs/korean_3b_fp8.yaml - valid, batch_size=5')
PY
then
  echo " [FAIL] configs/korean_3b_fp8.yaml - INVALID"
  ERRORS=$((ERRORS + 1))
fi

if (( ERRORS > 0 )); then
  echo ""
  echo "[ERROR] $ERRORS file(s) failed validation. Aborting."
  exit 1
fi
echo " All files validated successfully."

# --test-only: stop here, before anything is stopped, migrated or launched.
# The flag is compared as a string rather than executed as a command; an
# empty value executed as a command would count as success.
if [[ "$TEST_ONLY" == true ]]; then
  echo ""
  echo "[INFO] --test-only mode. Exiting without restart."
  exit 0
fi

# ---------------------------------------------------------------------------
# Step 2/6: stop the running training job (skipped with --skip-stop).
# Per the message printed below, the trainer is expected to write an
# emergency checkpoint when it receives SIGTERM.
# ---------------------------------------------------------------------------
if [[ "$SKIP_STOP" != true ]]; then
  echo ""
  echo "[2/6] Stopping current training (SIGTERM -> emergency checkpoint)..."
  if [[ -f "$PID_FILE" ]]; then
    PID=""
    # First line only; "|| true" tolerates an empty file or a missing
    # trailing newline.
    read -r PID < "$PID_FILE" || true
    if [[ ! "$PID" =~ ^[1-9][0-9]*$ ]]; then
      # kill interprets 0 and negative numbers as process groups, so only a
      # positive integer is ever passed on.
      echo " [WARN] $PID_FILE does not contain a valid PID; nothing signalled."
    elif kill -0 "$PID" 2>/dev/null; then
      echo " Sending SIGTERM to PID $PID..."
      kill "$PID"
      echo " Waiting for graceful shutdown (up to 120s)..."
      for ((i = 1; i <= 120; i++)); do
        if ! kill -0 "$PID" 2>/dev/null; then
          echo " Process stopped after ${i}s"
          break
        fi
        sleep 1
      done
      # Still alive after the grace period: escalate to SIGKILL.
      if kill -0 "$PID" 2>/dev/null; then
        echo " [WARN] Process still running after 120s. Force killing..."
        kill -9 "$PID" 2>/dev/null || true
        sleep 2
      fi
    else
      echo " Process $PID not running."
    fi
  else
    echo " No PID file found."
  fi

  # The PID file names a single process; wait (up to 30 s) until no process
  # whose command line matches the trainer pattern is left.
  echo " Waiting for GPU processes to terminate..."
  GPU_CLEARED=false
  for ((i = 1; i <= 30; i++)); do
    if ! pgrep -f "pretrain.py.*korean_3b" >/dev/null 2>&1; then
      echo " All GPU processes cleared."
      GPU_CLEARED=true
      break
    fi
    sleep 1
  done
  # Do not migrate the checkpoint or relaunch while trainer processes are
  # still alive; previously this timeout passed silently.
  if [[ "$GPU_CLEARED" != true ]]; then
    echo " [ERROR] Training processes still running after 30s. Aborting."
    echo "         Stop them manually, then re-run with --skip-stop."
    exit 1
  fi
fi

# ---------------------------------------------------------------------------
# Step 3/6: back up and migrate the newest checkpoint.
# ---------------------------------------------------------------------------
echo ""
echo "[3/6] Migrating latest checkpoint (QKV fusion)..."

# Expand the glob into an array instead of parsing `ls` output. Without
# nullglob an unmatched pattern stays as the literal string, hence the -e
# test on the first element.
CKPT_CANDIDATES=("${CKPT_DIR}"/checkpoint-*)
LATEST_CKPT=""
if [[ -e "${CKPT_CANDIDATES[0]:-}" ]]; then
  # Version sort so numeric suffixes order numerically (e.g. -10000 after
  # -9000); the last entry is the newest.
  LATEST_CKPT=$(printf '%s\n' "${CKPT_CANDIDATES[@]}" | sort -V | tail -n 1)
fi
if [[ -z "$LATEST_CKPT" ]]; then
  echo " [ERROR] No checkpoint found!"
  exit 1
fi
echo " Latest checkpoint: $LATEST_CKPT"

# Back up the weights before rewriting them. An existing backup is kept:
# on a re-run model.pt may already be migrated, and copying it over the
# backup would destroy the only pre-migration copy.
MODEL_BACKUP="${LATEST_CKPT}/model.pt.backup_pre_qkv"
if [[ -e "$MODEL_BACKUP" ]]; then
  echo " Backup already exists, keeping it: model.pt.backup_pre_qkv"
elif cp -- "${LATEST_CKPT}/model.pt" "$MODEL_BACKUP"; then
  echo " Backup created: model.pt.backup_pre_qkv"
else
  echo " [ERROR] Could not back up ${LATEST_CKPT}/model.pt. Aborting."
  # Drop a partial copy so a later run does not mistake it for a backup.
  rm -f -- "$MODEL_BACKUP"
  exit 1
fi

# Migrate in place; on failure put the backed-up weights back and stop
# rather than announcing success.
if ! python3 scripts/migrate_qkv_checkpoint.py "$LATEST_CKPT"; then
  echo " [ERROR] QKV fusion migration failed. Restoring original checkpoint..."
  if cp -- "$MODEL_BACKUP" "${LATEST_CKPT}/model.pt"; then
    echo " Original checkpoint restored. Aborting."
  else
    echo " [ERROR] Restore failed; the backup is still at $MODEL_BACKUP"
  fi
  exit 1
fi
echo " QKV fusion migration complete."

# ---------------------------------------------------------------------------
# Step 4/6: 5-step training run against the migrated checkpoint, limited to
# 120 s by timeout; only the last 10 lines of output are shown.
# NOTE(review): the scratch directory is a fixed path under /tmp; step 5
# removes the same path.
# ---------------------------------------------------------------------------
echo ""
echo "[4/6] Running 5-step validation test..."

timeout 120 python3 train/pretrain.py \
  --config configs/korean_3b_fp8.yaml \
  --train_data data/3b_train.bin \
  --checkpoint_dir /tmp/frankenstallm_test \
  --max_steps 5 \
  --batch_size 5 \
  --resume "$LATEST_CKPT" \
  2>&1 | tail -10
# Take the status of the first pipeline stage (timeout/python3). Plain $?
# would be tail's status, which is 0 even when the trainer crashes or times
# out. PIPESTATUS must be read immediately after the pipeline.
TEST_EXIT=${PIPESTATUS[0]}

if [[ $TEST_EXIT -eq 0 ]]; then
  echo " [OK] 5-step test passed!"
else
  echo " [FAIL] 5-step test FAILED (exit code $TEST_EXIT)"
  echo " [WARN] Restoring original checkpoint..."
  if cp -- "${LATEST_CKPT}/model.pt.backup_pre_qkv" "${LATEST_CKPT}/model.pt"; then
    echo " Original checkpoint restored. Aborting."
  else
    echo " [ERROR] Restore failed; the backup is still at ${LATEST_CKPT}/model.pt.backup_pre_qkv"
  fi
  exit 1
fi

# ---------------------------------------------------------------------------
# Step 5/6: remove the scratch checkpoint directory used by the 5-step test.
# ---------------------------------------------------------------------------
echo ""
echo "[5/6] Cleaning up test artifacts..."
rm -rf -- /tmp/frankenstallm_test

# ---------------------------------------------------------------------------
# Step 6/6: print a summary of the changes and relaunch training.
# ---------------------------------------------------------------------------
echo ""
echo "[6/6] Launching optimized training..."
echo ""
# Informational summary only; nothing in this list is applied by this block.
cat <<'EOF'
 Changes applied:
 * QKV Projection Fusion (single GEMM)
 * NUMA CPU Affinity (cores 0-35->GPU0-3, 36-71->GPU4-7)
 * Batch size: 4 -> 5
 * NCCL: NVLS,Ring algorithm, 256MB buffers
 * DDP: bucket_cap_mb 400 -> 800
 * DataLoader: 4->6 workers, prefetch 3->4
 * MADV_RANDOM + WILLNEED for dataset mmap
 * numactl --interleave=all on torchrun
EOF
echo ""

# Stop on a failed launch so the completion banner is never printed after
# a launcher error.
bash scripts/launch_3b_pretrain.sh
LAUNCH_EXIT=$?
if (( LAUNCH_EXIT != 0 )); then
  echo "[ERROR] scripts/launch_3b_pretrain.sh exited with status $LAUNCH_EXIT. Aborting."
  exit "$LAUNCH_EXIT"
fi

# Closing banner: tells the operator how to follow the training log.
echo ""
echo "=================================================================="
echo " Migration complete! Monitor with:"
echo " tail -f ${LOG_FILE}"
echo "=================================================================="