File size: 6,488 Bytes
48ecd01
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
#!/usr/bin/env bash
# =============================================================================
# apply_optimizations.sh β€” Apply v2 optimizations and restart training
#
# Optimizations applied:
#   1. QKV Projection Fusion (+8-12% throughput)
#   2. NUMA CPU Affinity (fix 69% cross-NUMA workers)
#   3. Batch size 4β†’5 (11h saved over full run)
#   4. NCCL NVLS algorithm + 256MB buffers
#   5. DDP bucket_cap_mb 400β†’800
#   6. DataLoader num_workers 4β†’6, prefetch_factor 3β†’4
#   7. MADV_RANDOM + WILLNEED for PackedDataset
#   8. numactl --interleave=all on torchrun
#
# Usage:
#   bash scripts/apply_optimizations.sh              # full migration
#   bash scripts/apply_optimizations.sh --test-only  # just validate, don't restart
#   bash scripts/apply_optimizations.sh --skip-stop  # don't stop current training
# =============================================================================
set -u

# Operate relative to the repo root regardless of invocation directory.
cd "$(dirname "$0")/.."

# Run identity and the paths derived from it.
RUN_NAME="korean_3b_fp8_run1"
CKPT_DIR="checkpoints/${RUN_NAME}"
PID_FILE="${CKPT_DIR}/train.pid"
LOG_FILE="${CKPT_DIR}/train.log"

# Mode flags, flipped by the command-line switches documented in the header.
# Unrecognized arguments are deliberately ignored.
TEST_ONLY=false
SKIP_STOP=false
while [[ $# -gt 0 ]]; do
    case "$1" in
        --test-only) TEST_ONLY=true ;;
        --skip-stop) SKIP_STOP=true ;;
    esac
    shift
done

echo "=================================================================="
echo "  FRANKENSTALLM 3B β€” Optimization Migration v2"
echo "  $(date)"
echo "=================================================================="

# ---- Step 1: Validate all modified files --------------------------------
# Every file touched by the optimization work is syntax-checked up front;
# on any failure we abort here, before the live run has been disturbed.
echo ""
echo "[1/6] Validating modified files..."
ERRORS=0

for pyfile in model/attention.py train/pretrain.py data/dataset.py scripts/migrate_qkv_checkpoint.py; do
    if python3 -c "import ast; ast.parse(open('$pyfile').read())" 2>/dev/null; then
        echo "  βœ“ $pyfile β€” syntax OK"
    else
        echo "  βœ— $pyfile β€” SYNTAX ERROR"
        ((ERRORS++))
    fi
done

if bash -n scripts/launch_3b_pretrain.sh 2>/dev/null; then
    echo "  βœ“ scripts/launch_3b_pretrain.sh β€” syntax OK"
else
    echo "  βœ— scripts/launch_3b_pretrain.sh β€” SYNTAX ERROR"
    ((ERRORS++))
fi

# Check YAML parses and that the batch-size bump actually landed.
# Capture combined output so that on failure the operator sees WHY
# (parse error vs. wrong batch_size) instead of it being discarded.
if YAML_OUT=$(python3 -c "
import yaml
with open('configs/korean_3b_fp8.yaml') as f:
    cfg = yaml.safe_load(f)
assert cfg['train']['batch_size'] == 5, f'batch_size should be 5, got {cfg[\"train\"][\"batch_size\"]}'
print('  βœ“ configs/korean_3b_fp8.yaml β€” valid, batch_size=5')
" 2>&1); then
    echo "$YAML_OUT"
else
    echo "  βœ— configs/korean_3b_fp8.yaml β€” INVALID"
    # Show the tail of the Python traceback, indented for readability.
    printf '%s\n' "$YAML_OUT" | tail -3 | sed 's/^/      /'
    ((ERRORS++))
fi

if [[ $ERRORS -gt 0 ]]; then
    echo ""
    echo "[ERROR] $ERRORS file(s) failed validation. Aborting."
    exit 1
fi
echo "  All files validated successfully."

if $TEST_ONLY; then
    echo ""
    echo "[INFO] --test-only mode. Exiting without restart."
    exit 0
fi

# ---- Step 2: Stop current training (graceful) ---------------------------
# SIGTERM first so the trainer can write its emergency checkpoint; SIGKILL
# only after the full 120s grace period has elapsed.
if ! $SKIP_STOP; then
    echo ""
    echo "[2/6] Stopping current training (SIGTERM β†’ emergency checkpoint)..."
    if [[ -f "$PID_FILE" ]]; then
        PID=$(cat "$PID_FILE")
        if kill -0 "$PID" 2>/dev/null; then
            echo "  Sending SIGTERM to PID $PID..."
            kill "$PID"
            echo "  Waiting for graceful shutdown (up to 120s)..."
            for i in {1..120}; do
                if ! kill -0 "$PID" 2>/dev/null; then
                    echo "  Process stopped after ${i}s"
                    break
                fi
                sleep 1
            done
            # Last resort: SIGKILL skips the emergency checkpoint, so it
            # only fires once the grace period is exhausted.
            if kill -0 "$PID" 2>/dev/null; then
                echo "  [WARN] Process still running after 120s. Force killing..."
                kill -9 "$PID" 2>/dev/null || true
                sleep 2
            fi
        else
            echo "  Process $PID not running."
        fi
    else
        echo "  No PID file found."
    fi

    # Wait for all rank workers to release the GPUs before relaunching.
    # Previously this loop timed out silently; now we warn, since stale
    # workers can make the relaunch fail at startup.
    echo "  Waiting for GPU processes to terminate..."
    GPU_CLEARED=false
    for i in {1..30}; do
        if ! pgrep -f "pretrain.py.*korean_3b" >/dev/null 2>&1; then
            echo "  All GPU processes cleared."
            GPU_CLEARED=true
            break
        fi
        sleep 1
    done
    if ! $GPU_CLEARED; then
        echo "  [WARN] GPU processes still present after 30s; relaunch may fail."
    fi
fi

# ---- Step 3: Find and migrate latest checkpoint -------------------------
echo ""
echo "[3/6] Migrating latest checkpoint (QKV fusion)..."
# printf + glob avoids parsing ls; sort -V orders checkpoint-900 before
# checkpoint-1000. If nothing matches, the literal pattern survives and
# fails the -d test below.
LATEST_CKPT=$(printf '%s\n' "${CKPT_DIR}"/checkpoint-* | sort -V | tail -1)
if [[ ! -d "$LATEST_CKPT" ]]; then
    echo "  [ERROR] No checkpoint found!"
    exit 1
fi
echo "  Latest checkpoint: $LATEST_CKPT"

# Backup original model.pt. The script runs without set -e, so a failed
# copy (missing file, disk full) must be caught explicitly β€” migrating
# without a backup would make rollback impossible.
if ! cp "${LATEST_CKPT}/model.pt" "${LATEST_CKPT}/model.pt.backup_pre_qkv"; then
    echo "  [ERROR] Failed to back up model.pt. Aborting."
    exit 1
fi
echo "  Backup created: model.pt.backup_pre_qkv"

# Run migration; a failure must not fall through to the relaunch with a
# half-migrated checkpoint on disk.
if ! python3 scripts/migrate_qkv_checkpoint.py "$LATEST_CKPT"; then
    echo "  [ERROR] QKV migration failed. Restoring backup..."
    cp "${LATEST_CKPT}/model.pt.backup_pre_qkv" "${LATEST_CKPT}/model.pt"
    exit 1
fi
echo "  QKV fusion migration complete."

# ---- Step 4: Quick validation test (5 steps) ----------------------------
echo ""
echo "[4/6] Running 5-step validation test..."
# Use single GPU for fast test; timeout guards against a hang (exit 124).
timeout 120 python3 train/pretrain.py \
    --config configs/korean_3b_fp8.yaml \
    --train_data data/3b_train.bin \
    --checkpoint_dir /tmp/frankenstallm_test \
    --max_steps 5 \
    --batch_size 5 \
    --resume "$LATEST_CKPT" \
    2>&1 | tail -10
# BUG FIX: `$?` after the pipeline is tail's exit status (always 0), which
# made this gate pass unconditionally. PIPESTATUS[0] is the real exit code
# of the training command (or 124 if timeout fired).
TEST_EXIT=${PIPESTATUS[0]}

if [[ $TEST_EXIT -eq 0 ]]; then
    echo "  βœ“ 5-step test passed!"
else
    echo "  βœ— 5-step test FAILED (exit code $TEST_EXIT)"
    echo "  [WARN] Restoring original checkpoint..."
    cp "${LATEST_CKPT}/model.pt.backup_pre_qkv" "${LATEST_CKPT}/model.pt"
    echo "  Original checkpoint restored. Aborting."
    exit 1
fi

# ---- Step 5: Clean up test artifacts ------------------------------------
echo ""
echo "[5/6] Cleaning up test artifacts..."
rm -rf /tmp/frankenstallm_test

# ---- Step 6: Launch full training with optimizations --------------------
echo ""
echo "[6/6] Launching optimized training..."
echo ""
echo "  Changes applied:"
echo "    β€’ QKV Projection Fusion (single GEMM)"
echo "    β€’ NUMA CPU Affinity (cores 0-35β†’GPU0-3, 36-71β†’GPU4-7)"
echo "    β€’ Batch size: 4 β†’ 5"
echo "    β€’ NCCL: NVLS,Ring algorithm, 256MB buffers"
echo "    β€’ DDP: bucket_cap_mb 400 β†’ 800"
echo "    β€’ DataLoader: 4β†’6 workers, prefetch 3β†’4"
echo "    β€’ MADV_RANDOM + WILLNEED for dataset mmap"
echo "    β€’ numactl --interleave=all on torchrun"
echo ""

# Propagate a launch failure instead of printing the success banner over it
# (the script runs without set -e, so this must be checked explicitly).
if ! bash scripts/launch_3b_pretrain.sh; then
    echo "[ERROR] launch_3b_pretrain.sh failed β€” training was NOT restarted." >&2
    exit 1
fi

echo ""
echo "=================================================================="
echo "  Migration complete! Monitor with:"
echo "    tail -f ${LOG_FILE}"
echo "=================================================================="