File size: 3,824 Bytes
48ecd01
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
#!/usr/bin/env bash
# =============================================================================
# launch_korean_3b.sh — 8-GPU FP8 pretraining launcher for 3B Korean LLM
#
# Usage:
#   bash scripts/launch_korean_3b.sh                     # full run (~60B tokens)
#   bash scripts/launch_korean_3b.sh --max_steps 50      # quick benchmark
#   bash scripts/launch_korean_3b.sh --resume checkpoints/korean_3b_fp8_run1/checkpoint-XXXXX
#
# Effective batch size: 4 (local) × 8 GPU × 8 (grad_accum) × 4096 (seq_len)
#                     = 1,048,576 tokens / step
# =============================================================================
# Strict mode: abort on a failing command, on use of an unset variable, and
# when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# ---- Run identity: each value may be overridden from the environment -------
: "${RUN_NAME:=korean_3b_fp8_run1}"
: "${CONFIG:=configs/3b_pretrain.yaml}"
: "${MASTER_PORT:=29502}"

# ---- Paths derived from the run name ---------------------------------------
CKPT_DIR="checkpoints/${RUN_NAME}"
LOG_FILE="${CKPT_DIR}/train.log"

# ---- Fixed launch parameters -----------------------------------------------
NPROC=8            # value given to torchrun --nproc_per_node
SEED=42
LR=1.5e-4
WARMUP_STEPS=2000
MAX_STEPS=57000
GRAD_ACCUM=8
BATCH_SIZE=4       # per-process ("local") batch size shown in the banner

# Every command-line argument, flattened into one space-separated string.
EXTRA_ARGS="$*"

# ---- B200 / NVSwitch NCCL tuning -------------------------------------------
# Values are assigned first and exported together below so that child
# processes inherit them. Refer to the NCCL environment-variable documentation
# for the exact meaning of each setting.
NCCL_IB_DISABLE=1
NCCL_ALGO=Ring
NCCL_PROTO=Simple
NCCL_MIN_NCHANNELS=16     # minimum and maximum are both set to 16
NCCL_MAX_NCHANNELS=16
NCCL_BUFFSIZE=67108864    # 64 * 1024 * 1024
export NCCL_IB_DISABLE NCCL_ALGO NCCL_PROTO \
       NCCL_MIN_NCHANNELS NCCL_MAX_NCHANNELS NCCL_BUFFSIZE

# Thread-count settings for the OpenMP and MKL runtimes.
OMP_NUM_THREADS=4
MKL_NUM_THREADS=4
export OMP_NUM_THREADS MKL_NUM_THREADS

# cd FIRST — every relative path checked below (including a relative
# TRAIN_DATA / VAL_DATA override) is resolved against the project root, i.e.
# the parent of the directory that contains this script.
cd "$(dirname "$0")/.." || {
    echo "ERROR: cannot cd to project root from $(dirname "$0")" >&2
    exit 1
}

# TRAIN_DATA resolution (performed after the cd so relative paths work):
#   1. a non-empty TRAIN_DATA from the environment always wins;
#   2. otherwise the merged dataset, if present;
#   3. otherwise the older Korean-only dataset;
#   4. otherwise abort.
# Checking the override first means a custom path is honoured even when
# neither default file exists, and the message reports the path really used.
if [[ -n "${TRAIN_DATA:-}" ]]; then
    echo "Using training data from TRAIN_DATA: ${TRAIN_DATA}"
elif [[ -f "data/merged_3b_train.bin" ]]; then
    TRAIN_DATA="data/merged_3b_train.bin"
    echo "Using merged training data: data/merged_3b_train.bin"
elif [[ -f "data/korean_train.bin" ]]; then
    TRAIN_DATA="data/korean_train.bin"
    echo "Using fallback training data: data/korean_train.bin"
else
    echo "ERROR: No training data found (data/merged_3b_train.bin or data/korean_train.bin)" >&2
    exit 1
fi

# VAL_DATA resolution (also after the cd): use the override or the merged
# validation set; if that file is missing, fall back to the Korean-only
# validation set and say so instead of switching silently.
VAL_DATA="${VAL_DATA:-data/merged_3b_val.bin}"
if [[ ! -f "${VAL_DATA}" ]]; then
    echo "Validation data not found at ${VAL_DATA}; falling back to data/korean_val.bin" >&2
    VAL_DATA="data/korean_val.bin"
fi

# Final existence checks on the paths that will be handed to the trainer.
if [[ ! -f "${TRAIN_DATA}" ]]; then
    echo "ERROR: Training data not found: ${TRAIN_DATA}" >&2
    exit 1
fi
if [[ ! -f "${VAL_DATA}" ]]; then
    echo "ERROR: Validation data not found: ${VAL_DATA}" >&2
    exit 1
fi

# Create the checkpoint directory up front: LOG_FILE lives inside it, so it
# must exist before anything appends to the log.
mkdir -p "${CKPT_DIR}"

# Launch banner (stdout only). The values shown are this script's defaults;
# flags passed on the command line are not reflected here.
echo "=================================================================="
echo "  Run name    : ${RUN_NAME}"
echo "  Config      : ${CONFIG}"
echo "  Train data  : ${TRAIN_DATA}"
echo "  Val data    : ${VAL_DATA}"
echo "  CKPT dir    : ${CKPT_DIR}"
echo "  Max steps   : ${MAX_STEPS}"
echo "  LR          : ${LR}"
echo "  Batch size  : ${BATCH_SIZE} (local) × ${NPROC} GPU × ${GRAD_ACCUM} grad_accum = $(( BATCH_SIZE * NPROC * GRAD_ACCUM )) sequences / step"
echo "  Started     : $(date)"
echo "=================================================================="

# Python warnings filter (action:message:category:module): ignore UserWarning
# raised from torch.library in the worker processes.
export PYTHONWARNINGS="ignore::UserWarning:torch.library"

# Basic-regex patterns for known-noisy lines; any line matching one of them is
# dropped from both the console and the log file.
NOISE_PATTERNS=(
    -e "UserWarning"
    -e "Warning only once"
    -e "Overriding a previously"
    -e "dispatch key:"
    -e "previous kernel:"
    -e "new kernel:"
    -e "operator: flash_attn"
    -e "registered at /usr/local"
    -e "self.m.impl"
)

# Launch the trainer.
#  * "$@" forwards the caller's arguments exactly as given (no re-splitting or
#    glob expansion), so values containing spaces survive intact. They come
#    after the defaults; presumably the trainer lets a repeated flag override
#    the earlier one — confirm against train/pretrain.py.
#  * A single line-buffered grep replaces a chain of block-buffered ones, so
#    output reaches the terminal and log as it is produced.
#  * grep exits 1 when it selects no lines; that is not an error here, so only
#    a real grep failure (status > 1) is allowed to fail the pipeline.
# NOTE(review): LOG_FILE is both passed to --log_file and appended to by tee;
# verify the trainer's own file logging does not duplicate these lines.
torchrun \
    --nproc_per_node="${NPROC}" \
    --master_port="${MASTER_PORT}" \
    train/pretrain.py \
    --config "${CONFIG}" \
    --train_data "${TRAIN_DATA}" \
    --val_data "${VAL_DATA}" \
    --checkpoint_dir "${CKPT_DIR}" \
    --log_file "${LOG_FILE}" \
    --max_steps "${MAX_STEPS}" \
    --batch_size "${BATCH_SIZE}" \
    --lr "${LR}" \
    --grad_accum "${GRAD_ACCUM}" \
    --warmup_steps "${WARMUP_STEPS}" \
    --seed "${SEED}" \
    "$@" \
    2>&1 \
    | { grep --line-buffered -v "${NOISE_PATTERNS[@]}" || [[ $? -eq 1 ]]; } \
    | tee -a "${LOG_FILE}"

# Completion banner with the finish timestamp, one argument per output line.
printf '%s\n' \
    "==================================================================" \
    "  Done : $(date)" \
    "=================================================================="