File size: 10,504 Bytes
48ecd01
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
#!/usr/bin/env bash
# =============================================================================
# launch_3b_pretrain.sh — 8-GPU FP8 pretraining launcher for Korean 3B LLM
#
# Features:
#   - SIGHUP protection: when SSH disconnects, the session is automatically
#     protected with nohup+setsid
#   - Graceful shutdown: on SIGTERM the Python signal handler saves an
#     emergency checkpoint
#   - Auto resume: automatically resumes from the latest checkpoint
#   - PID file: for process monitoring and control
#   - grep pipeline exit code protection (|| true)
#
# Usage:
#   bash scripts/launch_3b_pretrain.sh                          # full run (60B tokens)
#   bash scripts/launch_3b_pretrain.sh --max_steps 500          # quick test
#   bash scripts/launch_3b_pretrain.sh --resume checkpoints/korean_3b_fp8_run1/checkpoint-0010000
#   MAX_STEPS=95000 bash scripts/launch_3b_pretrain.sh          # 100B tokens
#
# Monitoring:
#   tail -f checkpoints/korean_3b_fp8_run1/train.log
#   cat checkpoints/korean_3b_fp8_run1/train.pid
#
# Stop (emergency checkpoint is saved automatically):
#   kill $(cat checkpoints/korean_3b_fp8_run1/train.pid)
#
# Force kill (no checkpoint is saved):
#   kill -9 $(cat checkpoints/korean_3b_fp8_run1/train.pid)
# =============================================================================

# nounset: referencing an undefined variable is an error.
# NOTE: errexit (-e) and pipefail are intentionally NOT enabled.
#   Earlier problem: when every line is filtered out, the grep pipeline
#   returns exit code 1
#   -> pipefail propagated that as a script failure -> training was aborted.
#   Resolution: drop set -e / pipefail and append `|| true` to the grep chain.
set -o nounset

# ---- Configurable defaults --------------------------------------------------
# Each `: "${NAME:=value}"` line keeps a non-empty value already present in the
# environment and otherwise assigns the default shown.
: "${RUN_NAME:=korean_3b_fp8_run1}"
: "${CONFIG:=configs/korean_3b_fp8.yaml}"
: "${TRAIN_DATA:=data/3b_train.bin}"
: "${VAL_DATA:=data/3b_val.bin}"
: "${MASTER_PORT:=29501}"
: "${MAX_STEPS:=57000}"

# Fixed settings: assigned unconditionally, so values from the environment are
# ignored for these.
NPROC=8
BATCH_SIZE=5
GRAD_ACCUM=8
WARMUP_STEPS=2000
SEED=42

# Locations derived from the run name.
CKPT_DIR="checkpoints/${RUN_NAME}"
LOG_FILE="${CKPT_DIR}/train.log"

# ---- B200 / NVSwitch single-node NCCL tuning (3B optimized, v2) ----------
# Values are assigned first and exported together at the end of each group.
NCCL_IB_DISABLE=1
NCCL_ALGO=NVLS,Ring              # NVSwitch hardware reduction first (was Ring,Tree)
NCCL_PROTO=Simple
NCCL_NVLS_ENABLE=1               # NVLink SHARP - hardware-accelerated all-reduce
NCCL_MIN_NCHANNELS=32            # raise minimum for NVSwitch headroom (was 16)
NCCL_MAX_NCHANNELS=32
NCCL_BUFFSIZE=268435456          # 256MB (was 128MB) - reduces bucket pipeline stalls
NCCL_P2P_LEVEL=NVL
NCCL_NET_GDR_LEVEL=0
export NCCL_IB_DISABLE NCCL_ALGO NCCL_PROTO NCCL_NVLS_ENABLE \
       NCCL_MIN_NCHANNELS NCCL_MAX_NCHANNELS NCCL_BUFFSIZE \
       NCCL_P2P_LEVEL NCCL_NET_GDR_LEVEL

# CPU math-library thread counts and the PyTorch CUDA allocator setting.
OMP_NUM_THREADS=4
MKL_NUM_THREADS=4
PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export OMP_NUM_THREADS MKL_NUM_THREADS PYTORCH_CUDA_ALLOC_CONF

# Paths under /usr/local/cuda handed to Triton (CUDA headers and ptxas).
# NOTE(review): the comment previously attached to these two lines described
# keeping the Triton/Inductor cache on an executable filesystem because /tmp is
# noexec, but no cache-directory variable is set here - confirm whether one is
# missing.
TRITON_CUDACRT_PATH=/usr/local/cuda/include
TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas
export TRITON_CUDACRT_PATH TRITON_PTXAS_PATH

# ---- Working directory ------------------------------------------------------
# Resolve this script's absolute path BEFORE changing directory. The protected
# re-launch below runs the script a second time after the `cd`; with a relative
# $0 (e.g. `cd scripts && bash launch_3b_pretrain.sh`) the second run would be
# looked up relative to the new working directory and fail, while this shell
# still reported a successful launch.
case "$0" in
    /*) SCRIPT_PATH="$0" ;;
    *)  SCRIPT_PATH="${PWD}/$0" ;;
esac

# Work from the parent of the script's directory; the relative config, data and
# checkpoint paths are resolved from there. Stop if the directory cannot be
# entered instead of continuing in an arbitrary location.
if ! cd -- "$(dirname -- "${SCRIPT_PATH}")/.."; then
    echo "[ERROR] Cannot change to the parent of the script directory: ${SCRIPT_PATH}" >&2
    exit 1
fi

# The PID file and both logs live in the checkpoint directory, so failing to
# create it is fatal.
if ! mkdir -p -- "${CKPT_DIR}"; then
    echo "[ERROR] Cannot create checkpoint directory: ${CKPT_DIR}" >&2
    exit 1
fi

# ---- Session protection (SIGHUP defense) ------------------------------------
# When started outside tmux/screen, the script automatically wraps itself in
# nohup + setsid to protect the training process from an SSH disconnect
# (SIGHUP).
#
# How it works:
#   1. Check for tmux / screen / an already-protected run.
#   2. If unprotected, set _LAUNCH_PROTECTED=1 and re-execute this script via
#      nohup setsid.
#   3. The re-executed process becomes a new session leader, detached from the
#      terminal.
#   4. The original shell prints the PID and monitoring commands, then exits
#      immediately.
PID_FILE="${CKPT_DIR}/train.pid"

if [[ -z "${_LAUNCH_PROTECTED:-}" && -z "${TMUX:-}" && -z "${STY:-}" ]]; then
    # Exported so the re-executed copy skips this branch.
    export _LAUNCH_PROTECTED=1
    NOHUP_LOG="${CKPT_DIR}/launch_$(date +%Y%m%d_%H%M%S).log"

    echo "=================================================================="
    echo "  SIGHUP PROTECTION ACTIVATED"
    echo "  tmux/screen 미감지 → 세션 보호 모드 자동 활성화 (nohup + setsid)"
    echo "  SSH 끊어져도 학습이 계속됩니다."
    echo "=================================================================="
    echo ""

    # Re-execute this script in session-protected mode, using the absolute
    # path computed before the cd above.
    nohup setsid bash "${SCRIPT_PATH}" "$@" > "${NOHUP_LOG}" 2>&1 &
    BG_PID=$!
    echo "${BG_PID}" > "${PID_FILE}"

    echo "  PID         : ${BG_PID}"
    echo "  PID 파일    : ${PID_FILE}"
    echo "  Launch 로그 : ${NOHUP_LOG}"
    echo "  학습 로그   : ${LOG_FILE}"
    echo ""
    echo "  모니터링:"
    echo "    tail -f ${LOG_FILE}"
    echo ""
    echo "  중지 (비상 체크포인트 자동 저장):"
    echo "    kill \$(cat ${PID_FILE})"
    echo ""
    echo "  강제 종료:"
    echo "    kill -9 \$(cat ${PID_FILE})"
    echo "=================================================================="
    exit 0
fi

# ---- Cleanup on exit --------------------------------------------------------
# PID of the background page-cache reader; stays empty until one is started.
PREWARM_PID=""

# EXIT handler: delete the PID file and, if a pre-warm reader was started,
# signal it to stop. Every step is best-effort (errors are discarded), so the
# handler always returns 0.
cleanup() {
    local warm_pid="${PREWARM_PID:-}"

    rm -f "${PID_FILE}" 2>/dev/null || true
    [[ -n "${warm_pid}" ]] || return 0
    kill "${warm_pid}" 2>/dev/null || true
}
trap cleanup EXIT

# Record this shell's PID (so the PID can be tracked when running inside
# tmux/screen as well).
# NOTE(review): this overwrites any existing PID file before the
# duplicate-process check in the pre-flight section has run - confirm that is
# the intended order.
printf '%s\n' "$$" > "${PID_FILE}"

# ---- Pre-flight checks ------------------------------------------------------
# Required inputs: abort when the config or the training data file is missing.
if [[ ! -f "${CONFIG}" ]]; then
    echo "[ERROR] Config not found: ${CONFIG}"
    exit 1
fi

if [[ ! -f "${TRAIN_DATA}" ]]; then
    echo "[ERROR] Training data not found: ${TRAIN_DATA}"
    exit 1
fi

# GPU memory check (3B: at least 80GB per GPU recommended; B200 = 192GB -> OK).
# Only the first GPU reported by nvidia-smi is inspected. Anything that is not
# a plain non-negative integer (nvidia-smi missing, "[N/A]", an error message)
# is treated as 0, i.e. "unknown, skip the warning". Feeding such text to an
# arithmetic test would otherwise raise an error and, with nounset active, can
# abort the script.
GPU_MEM=$(nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits 2>/dev/null | head -n 1)
GPU_MEM="${GPU_MEM//[[:space:]]/}"
if [[ ! "${GPU_MEM}" =~ ^[0-9]+$ ]]; then
    GPU_MEM=0
fi
# 10# forces base 10 so a value with leading zeros is not read as octal.
if (( 10#${GPU_MEM} > 0 && 10#${GPU_MEM} < 80000 )); then
    echo "[WARN] GPU memory ${GPU_MEM}MB < 80GB. 3B 학습에 부족할 수 있음."
fi

# Duplicate-process guard: refuse to start while a command line matching
# "pretrain.py.*korean_3b" is already running.
EXISTING_PID=$(pgrep -f "pretrain.py.*korean_3b" 2>/dev/null | head -n 1)
if [[ -n "${EXISTING_PID}" ]]; then
    echo "[ERROR] 이미 3B pretrain 프로세스 실행 중 (PID: ${EXISTING_PID})"
    echo "        kill ${EXISTING_PID} 로 먼저 종료하세요."
    exit 1
fi

# Free disk space check (at least 1TB required).
# -P keeps each filesystem on a single output line and -k fixes the unit to
# 1024-byte blocks, so column 4 of line 2 is always the available space in KiB.
# The check is skipped when /PROJECT cannot be queried.
AVAIL_KB=$(df -Pk /PROJECT 2>/dev/null | awk 'NR==2 {print $4}')
if [[ "${AVAIL_KB:-}" =~ ^[0-9]+$ ]] && (( 10#${AVAIL_KB} < 1073741824 )); then
    # awk (already required above) formats the value with a leading zero, e.g.
    # "0.5"; fall back to "?" if the conversion fails.
    AVAIL_TB=$(awk -v kb="${AVAIL_KB}" 'BEGIN { printf "%.1f", kb / 1073741824 }' 2>/dev/null)
    echo "[WARN] /PROJECT 여유 ${AVAIL_TB:-?}TB < 1TB. 체크포인트 저장 공간 부족 가능."
fi

# ---- Resume detection -------------------------------------------------------
#######################################
# Print the newest checkpoint entry in a directory.
# Arguments: $1 - directory to search for entries named checkpoint-*
# Outputs:   the path that sorts last in version order (sort -V), or nothing
#            when the directory holds no such entry
# Returns:   0
#######################################
_latest_checkpoint() {
    local dir="$1"
    local candidate

    # Glob instead of parsing `ls` output; an unmatched pattern stays literal,
    # so entries that do not exist are skipped.
    for candidate in "${dir}"/checkpoint-*; do
        [[ -e "${candidate}" ]] || continue
        printf '%s\n' "${candidate}"
    done | sort -V | tail -n 1
}

# RESUME_ARG stays empty unless a checkpoint is found automatically.
# EXTRA_ARGS is the script's argument list joined into one string.
RESUME_ARG=""
EXTRA_ARGS="${*:-}"
if [[ "${EXTRA_ARGS}" != *--resume* ]]; then
    # No --resume on the command line: auto-detect the most recent checkpoint.
    LATEST_CKPT="$(_latest_checkpoint "${CKPT_DIR}")"
    if [[ -n "${LATEST_CKPT}" ]]; then
        echo "[INFO] 자동 resume 감지: ${LATEST_CKPT}"
        RESUME_ARG="--resume ${LATEST_CKPT}"
    fi
fi

# ---- Banner ------------------------------------------------------------------
# Label describing how this process was started. Checked from highest to lowest
# precedence: protected re-launch, then screen, then tmux, otherwise direct.
if [[ -n "${_LAUNCH_PROTECTED:-}" ]]; then
    SESSION_TYPE="protected (nohup+setsid)"
elif [[ -n "${STY:-}" ]]; then
    SESSION_TYPE="screen"
elif [[ -n "${TMUX:-}" ]]; then
    SESSION_TYPE="tmux"
else
    SESSION_TYPE="direct"
fi

# Tokens consumed per optimizer step; 4096 is the per-sample token count used
# by the banner arithmetic.
TOKENS_PER_STEP=$((BATCH_SIZE * NPROC * GRAD_ACCUM * 4096))

# Print the run summary in one here-document.
cat <<EOF
==================================================================
  Korean 3B LLM Pre-Training (FP8)
  Run name    : ${RUN_NAME}
  Config      : ${CONFIG}
  CKPT dir    : ${CKPT_DIR}
  Log file    : ${LOG_FILE}
  Max steps   : ${MAX_STEPS}
  Batch       : ${BATCH_SIZE} (local) x ${NPROC} GPU x ${GRAD_ACCUM} accum
  Eff tokens  : ${TOKENS_PER_STEP} tokens/step (~1M)
  Total tokens: ~$((MAX_STEPS * TOKENS_PER_STEP / 1000000000))B
  Resume      : ${RESUME_ARG:-none (fresh start)}
  Session     : ${SESSION_TYPE}
  PID         : $$ (file: ${PID_FILE})
  Started     : $(date)
==================================================================
EOF

# Python warning filter handed to the training processes through the
# environment.
PYTHONWARNINGS="ignore::UserWarning:torch.library"
export PYTHONWARNINGS

# ---- Pre-warm OS page cache (NUMA-interleaved, non-blocking) ---------------
# When the training file exists, read it once into /dev/null in the background
# under `numactl --interleave=all`. The reader's PID is stored in PREWARM_PID,
# which is the process cleanup() signals on exit.
[[ -f "${TRAIN_DATA}" ]] && {
    echo "[INFO] Pre-warming page cache for ${TRAIN_DATA} (NUMA interleaved)..."
    numactl --interleave=all dd if="${TRAIN_DATA}" of=/dev/null bs=16M 2>/dev/null &
    PREWARM_PID=$!
}

# ---- Launch training ---------------------------------------------------------
#######################################
# Remove known-noisy warning lines from the combined training output.
# Reads stdin, writes every line that matches none of the patterns to stdout.
#   - A single grep with several -e patterns drops a line when ANY pattern
#     matches, which is what a chain of `grep -v` stages does.
#   - --line-buffered flushes after every line, so output reaches the
#     terminal / launch log as it is produced instead of in large blocks.
#   - grep exits 1 when it selects no lines; `|| true` keeps the status at 0.
# Returns: 0
#######################################
_filter_noise() {
    grep --line-buffered -v \
        -e "UserWarning" \
        -e "Warning only once" \
        -e "Overriding a previously" \
        -e "dispatch key:" \
        -e "previous kernel:" \
        -e "new kernel:" \
        -e "operator: flash_attn" \
        -e "registered at /usr/local" \
        -e "self.m.impl" \
        || true
}

# RESUME_ARG is either empty or "--resume <path>". Turn it into an argument
# vector so a path containing whitespace is passed as one argument.
RESUME_ARGV=()
if [[ -n "${RESUME_ARG:-}" ]]; then
    RESUME_ARGV=(--resume "${RESUME_ARG#--resume }")
fi

# The first pipeline stage runs in its own subshell. It records its PID in the
# PID file and then exec()s the training command, so the recorded PID is the
# one the training command runs under (assuming numactl exec()s its command
# rather than forking - TODO confirm for the installed numactl).
#
# Why: bash does not forward signals to a foreground pipeline, and an
# untrapped SIGTERM simply terminates the launcher. With the launcher's own
# PID in the file, `kill $(cat <pid file>)` ended this script while torchrun
# kept running and never received SIGTERM. With the training PID in the file
# the signal reaches torchrun directly, and this script stays alive to report
# the exit status.
#
# "$@" (the script's own arguments) is passed through verbatim, so arguments
# containing spaces are preserved.
{
    echo "${BASHPID}" > "${PID_FILE}"
    exec numactl --interleave=all \
        torchrun \
        --nproc_per_node="${NPROC}" \
        --master_port="${MASTER_PORT}" \
        train/pretrain.py \
        --config "${CONFIG}" \
        --train_data "${TRAIN_DATA}" \
        --val_data "${VAL_DATA}" \
        --checkpoint_dir "${CKPT_DIR}" \
        --log_file "${LOG_FILE}" \
        --max_steps "${MAX_STEPS}" \
        --batch_size "${BATCH_SIZE}" \
        --grad_accum "${GRAD_ACCUM}" \
        --warmup_steps "${WARMUP_STEPS}" \
        --seed "${SEED}" \
        ${RESUME_ARGV[@]+"${RESUME_ARGV[@]}"} \
        "$@"
} 2>&1 | _filter_noise

# Exit status of the first pipeline stage (the training command), not of the
# filter. Must stay immediately after the pipeline.
EXIT_CODE=${PIPESTATUS[0]}

# ---- Exit summary ------------------------------------------------------------
# Print the finish time, the training exit code and a status line describing
# it, then exit with that same code.
#   0   -> success          143 -> 128 + 15 (SIGTERM)
#   137 -> 128 + 9 (SIGKILL)  1 -> generic error
# The Korean status texts were mis-encoded and are restored to readable UTF-8.
echo ""
echo "=================================================================="
echo "  Finished  : $(date)"
echo "  Exit code : ${EXIT_CODE}"
case "${EXIT_CODE}" in
    0)
        echo "  Status    : SUCCESS (학습 완료 또는 graceful shutdown)"
        ;;
    143)
        echo "  Status    : TERMINATED (SIGTERM — 비상 체크포인트 저장됨)"
        ;;
    137)
        echo "  Status    : KILLED (SIGKILL — 강제 종료, 체크포인트 미저장)"
        ;;
    1)
        echo "  Status    : ERROR (${LOG_FILE} 확인 필요)"
        ;;
    *)
        echo "  Status    : FAILED (exit code ${EXIT_CODE}, ${LOG_FILE} 확인)"
        ;;
esac
echo "=================================================================="
exit "${EXIT_CODE}"