#!/usr/bin/env bash
# =============================================================================
# launch_3b_pretrain.sh - 8-GPU FP8 pretraining launcher for Korean 3B LLM
#
# Features:
# - SIGHUP protection: on SSH disconnect, the session is automatically shielded via nohup+setsid
# - Graceful shutdown: on SIGTERM, the Python signal handler saves an emergency checkpoint
# - Auto resume: training resumes automatically from the latest checkpoint
# - PID file: for process monitoring and control
# - grep pipeline exit-code protection (|| true)
#
# Usage:
# bash scripts/launch_3b_pretrain.sh # full run (~75B tokens)
# bash scripts/launch_3b_pretrain.sh --max_steps 500 # quick test
# bash scripts/launch_3b_pretrain.sh --resume checkpoints/korean_3b_fp8_run1/checkpoint-0010000
# MAX_STEPS=95000 bash scripts/launch_3b_pretrain.sh # ~125B tokens
#
# ๋ชจ๋‹ˆํ„ฐ๋ง:
# tail -f checkpoints/korean_3b_fp8_run1/train.log
# cat checkpoints/korean_3b_fp8_run1/train.pid
#
# ์ค‘์ง€ (๋น„์ƒ ์ฒดํฌํฌ์ธํŠธ ์ž๋™ ์ €์žฅ):
# kill $(cat checkpoints/korean_3b_fp8_run1/train.pid)
#
# ๊ฐ•์ œ ์ข…๋ฃŒ (์ฒดํฌํฌ์ธํŠธ ์ €์žฅ ์—†์Œ):
# kill -9 $(cat checkpoints/korean_3b_fp8_run1/train.pid)
# =============================================================================
# -u: error on undefined variables
# NOTE: -e and -o pipefail intentionally removed.
# Previous problem: when the grep pipeline filtered out every line, grep returned exit code 1
# → pipefail propagated that as a script failure → training was killed
# Fix: drop set -e/pipefail and add || true to the grep chain
set -u
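# Minimal repro of the old failure mode (illustrative only):
#   set -o pipefail
#   echo "UserWarning: x" | grep -v "UserWarning"   # grep exits 1: every line filtered
#   echo $?                                         # prints 1, i.e. pipeline "failed"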
# ---- Configurable defaults --------------------------------------------------
RUN_NAME="${RUN_NAME:-korean_3b_fp8_run1}"
CONFIG="${CONFIG:-configs/korean_3b_fp8.yaml}"
TRAIN_DATA="${TRAIN_DATA:-data/3b_train.bin}"
VAL_DATA="${VAL_DATA:-data/3b_val.bin}"
CKPT_DIR="checkpoints/${RUN_NAME}"
LOG_FILE="${CKPT_DIR}/train.log"
NPROC=8
MASTER_PORT="${MASTER_PORT:-29501}"
MAX_STEPS="${MAX_STEPS:-57000}"
BATCH_SIZE=5
GRAD_ACCUM=8
WARMUP_STEPS=2000
SEED=42
# ---- B200 / NVSwitch single-node NCCL tuning (3B optimized, v2) ----------
export NCCL_IB_DISABLE=1
export NCCL_ALGO=NVLS,Ring # NVSwitch hardware reduction first (was Ring,Tree)
export NCCL_PROTO=Simple
export NCCL_NVLS_ENABLE=1 # NVLink SHARP: hardware-accelerated all-reduce
export NCCL_MIN_NCHANNELS=32 # raise minimum for NVSwitch headroom (was 16)
export NCCL_MAX_NCHANNELS=32
export NCCL_BUFFSIZE=268435456 # 256MB (was 128MB); reduces bucket pipeline stalls
export NCCL_P2P_LEVEL=NVL
export NCCL_NET_GDR_LEVEL=0
export OMP_NUM_THREADS=4
export MKL_NUM_THREADS=4
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
# Point Triton/Inductor at the system CUDA toolchain (headers + ptxas) rather than
# copies unpacked under /tmp, which is mounted noexec here
export TRITON_CUDACRT_PATH=/usr/local/cuda/include
export TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas
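# Optional sanity check (illustrative; assumes a short smoke run is acceptable):
# confirm NCCL actually selects NVLS by enabling debug output for a few steps:
#   NCCL_DEBUG=INFO NCCL_DEBUG_SUBSYS=INIT,TUNING MAX_STEPS=10 bash scripts/launch_3b_pretrain.sh
# then search the launch/train logs for lines mentioning NVLS.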
cd "$(dirname "$0")/.."
mkdir -p "${CKPT_DIR}"
# ---- Session protection (SIGHUP guard) ---------------------------------------
# When run outside tmux/screen, the script automatically re-wraps itself in
# nohup + setsid to protect the training process from SSH disconnects (SIGHUP).
#
# How it works:
# 1. Check whether we are inside tmux/screen or already protected
# 2. If unprotected, set _LAUNCH_PROTECTED=1 and re-exec ourselves under nohup setsid
# 3. The re-executed process becomes a new session leader, detached from the terminal
# 4. The original shell prints the PID plus monitoring commands and exits immediately
PID_FILE="${CKPT_DIR}/train.pid"
if [[ -z "${_LAUNCH_PROTECTED:-}" ]] && [[ -z "${TMUX:-}" ]] && [[ -z "${STY:-}" ]]; then
export _LAUNCH_PROTECTED=1
NOHUP_LOG="${CKPT_DIR}/launch_$(date +%Y%m%d_%H%M%S).log"
echo "=================================================================="
echo " SIGHUP PROTECTION ACTIVATED"
echo " tmux/screen ๋ฏธ๊ฐ์ง€ โ†’ ์„ธ์…˜ ๋ณดํ˜ธ ๋ชจ๋“œ ์ž๋™ ํ™œ์„ฑํ™” (nohup + setsid)"
echo " SSH ๋Š์–ด์ ธ๋„ ํ•™์Šต์ด ๊ณ„์†๋ฉ๋‹ˆ๋‹ค."
echo "=================================================================="
echo ""
# ์ž๊ธฐ ์ž์‹ ์„ ์„ธ์…˜ ๋ณดํ˜ธ ๋ชจ๋“œ๋กœ ์žฌ์‹คํ–‰
nohup setsid bash "$0" "$@" > "${NOHUP_LOG}" 2>&1 &
BG_PID=$!
echo "${BG_PID}" > "${PID_FILE}"
echo " PID : ${BG_PID}"
echo " PID ํŒŒ์ผ : ${PID_FILE}"
echo " Launch ๋กœ๊ทธ : ${NOHUP_LOG}"
echo " ํ•™์Šต ๋กœ๊ทธ : ${LOG_FILE}"
echo ""
echo " ๋ชจ๋‹ˆํ„ฐ๋ง:"
echo " tail -f ${LOG_FILE}"
echo ""
echo " ์ค‘์ง€ (๋น„์ƒ ์ฒดํฌํฌ์ธํŠธ ์ž๋™ ์ €์žฅ):"
echo " kill \$(cat ${PID_FILE})"
echo ""
echo " ๊ฐ•์ œ ์ข…๋ฃŒ:"
echo " kill -9 \$(cat ${PID_FILE})"
echo "=================================================================="
exit 0
fi
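# To verify the relaunched process really is detached from the terminal (illustrative):
#   ps -o pid,sid,tty,cmd -p "$(cat checkpoints/korean_3b_fp8_run1/train.pid)"
# A detached session leader reports TTY "?" and PID == SID.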
# ---- Cleanup on exit --------------------------------------------------------
PREWARM_PID=""
cleanup() {
rm -f "${PID_FILE}" 2>/dev/null || true
if [[ -n "${PREWARM_PID:-}" ]]; then
kill "${PREWARM_PID}" 2>/dev/null || true
fi
}
trap cleanup EXIT
# PID ํŒŒ์ผ ๊ธฐ๋ก (tmux/screen ๋‚ด์—์„œ ์‹คํ–‰ ์‹œ์—๋„ PID ์ถ”์  ๊ฐ€๋Šฅ)
echo "$$" > "${PID_FILE}"
# ---- Pre-flight checks ------------------------------------------------------
if [[ ! -f "${CONFIG}" ]]; then
echo "[ERROR] Config not found: ${CONFIG}"
exit 1
fi
if [[ ! -f "${TRAIN_DATA}" ]]; then
echo "[ERROR] Training data not found: ${TRAIN_DATA}"
exit 1
fi
# GPU memory check (3B training recommends at least 80GB/GPU; B200 = 192GB → OK)
GPU_MEM=$(nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits 2>/dev/null | head -1 || echo "0")
if [[ "$GPU_MEM" -gt 0 && "$GPU_MEM" -lt 80000 ]]; then
echo "[WARN] GPU memory ${GPU_MEM}MB < 80GB. 3B ํ•™์Šต์— ๋ถ€์กฑํ•  ์ˆ˜ ์žˆ์Œ."
fi
# Prevent duplicate training processes
EXISTING_PID=$(pgrep -f "pretrain.py.*korean_3b" 2>/dev/null | head -1 || true)
if [[ -n "$EXISTING_PID" ]]; then
echo "[ERROR] ์ด๋ฏธ 3B pretrain ํ”„๋กœ์„ธ์Šค ์‹คํ–‰ ์ค‘ (PID: ${EXISTING_PID})"
echo " kill ${EXISTING_PID} ๋กœ ๋จผ์ € ์ข…๋ฃŒํ•˜์„ธ์š”."
exit 1
fi
# ๋””์Šคํฌ ์—ฌ์œ  ํ™•์ธ (์ตœ์†Œ 1TB ํ•„์š”)
AVAIL_KB=$(df /PROJECT 2>/dev/null | awk 'NR==2{print $4}')
if [[ -n "${AVAIL_KB:-}" ]] && [[ "$AVAIL_KB" -lt 1073741824 ]]; then
AVAIL_TB=$(echo "scale=1; $AVAIL_KB / 1073741824" | bc 2>/dev/null || echo "?")
echo "[WARN] /PROJECT ์—ฌ์œ  ${AVAIL_TB}TB < 1TB. ์ฒดํฌํฌ์ธํŠธ ์ €์žฅ ๊ณต๊ฐ„ ๋ถ€์กฑ ๊ฐ€๋Šฅ."
fi
# ---- Resume detection -------------------------------------------------------
RESUME_ARG=""
EXTRA_ARGS="${*:-}"
if [[ ! "${EXTRA_ARGS}" =~ "--resume" ]]; then
# Auto-detect the most recent checkpoint
LATEST_CKPT=$(ls -d "${CKPT_DIR}"/checkpoint-* 2>/dev/null | sort -V | tail -1 || true)
if [[ -n "$LATEST_CKPT" ]]; then
echo "[INFO] ์ž๋™ resume ๊ฐ์ง€: ${LATEST_CKPT}"
RESUME_ARG="--resume ${LATEST_CKPT}"
fi
fi
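# Note: sort -V treats the numeric suffix as a version number, so
# checkpoint-9000 sorts before checkpoint-10000 even without zero padding.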
# ---- Banner ------------------------------------------------------------------
SESSION_TYPE="direct"
[[ -n "${TMUX:-}" ]] && SESSION_TYPE="tmux"
[[ -n "${STY:-}" ]] && SESSION_TYPE="screen"
[[ -n "${_LAUNCH_PROTECTED:-}" ]] && SESSION_TYPE="protected (nohup+setsid)"
echo "=================================================================="
echo " Korean 3B LLM Pre-Training (FP8)"
echo " Run name : ${RUN_NAME}"
echo " Config : ${CONFIG}"
echo " CKPT dir : ${CKPT_DIR}"
echo " Log file : ${LOG_FILE}"
echo " Max steps : ${MAX_STEPS}"
echo " Batch : ${BATCH_SIZE} (local) x ${NPROC} GPU x ${GRAD_ACCUM} accum"
echo " Eff tokens : $((BATCH_SIZE * NPROC * GRAD_ACCUM * 4096)) tokens/step (~1M)"
echo " Total tokens: ~$((MAX_STEPS * BATCH_SIZE * NPROC * GRAD_ACCUM * 4096 / 1000000000))B"
echo " Resume : ${RESUME_ARG:-none (fresh start)}"
echo " Session : ${SESSION_TYPE}"
echo " PID : $$ (file: ${PID_FILE})"
echo " Started : $(date)"
echo "=================================================================="
export PYTHONWARNINGS="ignore::UserWarning:torch.library"
# ---- Pre-warm OS page cache (NUMA-interleaved, non-blocking) ---------------
if [[ -f "${TRAIN_DATA}" ]]; then
echo "[INFO] Pre-warming page cache for ${TRAIN_DATA} (NUMA interleaved)..."
numactl --interleave=all dd if="${TRAIN_DATA}" of=/dev/null bs=16M 2>/dev/null &
PREWARM_PID=$!
fi
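# Optional check (assumes vmtouch is installed): see how much of the file is
# already resident in the page cache, e.g.
#   vmtouch "${TRAIN_DATA}" | grep "Resident Pages"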
# ---- Launch training ---------------------------------------------------------
# grep pipeline protection:
# Problem: grep -v returns exit code 1 when no lines survive the filter
# Fix: wrapping in { ... || true; } guarantees the filter stage always exits 0;
#      torchrun's real exit code is captured separately via PIPESTATUS[0]
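# Illustrative behavior of the wrapper (not executed here):
#   true  | { grep -v x || true; };  echo "${PIPESTATUS[0]}"   # prints 0
#   false | { grep -v x || true; };  echo "${PIPESTATUS[0]}"   # prints 1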
numactl --interleave=all \
torchrun \
--nproc_per_node=${NPROC} \
--master_port=${MASTER_PORT} \
train/pretrain.py \
--config "${CONFIG}" \
--train_data "${TRAIN_DATA}" \
--val_data "${VAL_DATA}" \
--checkpoint_dir "${CKPT_DIR}" \
--log_file "${LOG_FILE}" \
--max_steps ${MAX_STEPS} \
--batch_size ${BATCH_SIZE} \
--grad_accum ${GRAD_ACCUM} \
--warmup_steps ${WARMUP_STEPS} \
--seed ${SEED} \
${RESUME_ARG} \
${EXTRA_ARGS} \
2>&1 | { grep -v "UserWarning" \
| grep -v "Warning only once" \
| grep -v "Overriding a previously" \
| grep -v "dispatch key:" \
| grep -v "previous kernel:" \
| grep -v "new kernel:" \
| grep -v "operator: flash_attn" \
| grep -v "registered at /usr/local" \
| grep -v "self.m.impl" \
|| true; }
EXIT_CODE=${PIPESTATUS[0]}
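# NOTE: PIPESTATUS must be read immediately; the very next command replaces it.
# Index 0 is torchrun; index 1 is the { grep ... || true; } stage (always 0).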
# ---- Exit summary ------------------------------------------------------------
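# Shells report death-by-signal as 128+N: 143 = 128+15 (SIGTERM), 137 = 128+9 (SIGKILL).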
echo ""
echo "=================================================================="
echo " Finished : $(date)"
echo " Exit code : ${EXIT_CODE}"
if [[ ${EXIT_CODE} -eq 0 ]]; then
echo " Status : SUCCESS (training finished or graceful shutdown)"
elif [[ ${EXIT_CODE} -eq 143 ]]; then
echo " Status : TERMINATED (SIGTERM; emergency checkpoint saved)"
elif [[ ${EXIT_CODE} -eq 137 ]]; then
echo " Status : KILLED (SIGKILL; force-killed, checkpoint NOT saved)"
elif [[ ${EXIT_CODE} -eq 1 ]]; then
echo " Status : ERROR (check ${LOG_FILE})"
else
echo " Status : FAILED (exit code ${EXIT_CODE}; check ${LOG_FILE})"
fi
echo "=================================================================="
exit ${EXIT_CODE}