#!/usr/bin/env bash
#
# Launcher for Korean 3B LLM FP8 pre-training (8-GPU torchrun).
# Every run parameter below can be overridden via environment variables.

# -u: fail fast on unset variables.
# -e / -o pipefail are intentionally NOT set: this script inspects non-zero
# statuses itself (PIPESTATUS of the training pipeline, best-effort preflight
# checks) and must reach its summary section even when training fails.
set -u

# --- Run identification & paths (env-overridable) -------------------
RUN_NAME="${RUN_NAME:-korean_3b_fp8_run1}"
CONFIG="${CONFIG:-configs/korean_3b_fp8.yaml}"
TRAIN_DATA="${TRAIN_DATA:-data/3b_train.bin}"
VAL_DATA="${VAL_DATA:-data/3b_val.bin}"
CKPT_DIR="checkpoints/${RUN_NAME}"
LOG_FILE="${CKPT_DIR}/train.log"
NPROC="${NPROC:-8}"                    # GPUs per node
MASTER_PORT="${MASTER_PORT:-29501}"

# --- Training hyperparameters (env-overridable, same convention) ----
MAX_STEPS="${MAX_STEPS:-57000}"
BATCH_SIZE="${BATCH_SIZE:-5}"          # per-GPU micro-batch
GRAD_ACCUM="${GRAD_ACCUM:-8}"
WARMUP_STEPS="${WARMUP_STEPS:-2000}"
SEED="${SEED:-42}"
| |
|
| | |
| | export NCCL_IB_DISABLE=1 |
| | export NCCL_ALGO=NVLS,Ring |
| | export NCCL_PROTO=Simple |
| | export NCCL_NVLS_ENABLE=1 |
| | export NCCL_MIN_NCHANNELS=32 |
| | export NCCL_MAX_NCHANNELS=32 |
| | export NCCL_BUFFSIZE=268435456 |
| | export NCCL_P2P_LEVEL=NVL |
| | export NCCL_NET_GDR_LEVEL=0 |
| | export OMP_NUM_THREADS=4 |
| | export MKL_NUM_THREADS=4 |
| | export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True |
| | |
| | export TRITON_CUDACRT_PATH=/usr/local/cuda/include |
| | export TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas |
| |
|
# Run from the repository root (parent of this script's directory).
# An unchecked cd would silently leave us in the wrong directory and every
# relative path below would misbehave — abort loudly instead.
cd "$(dirname "$0")/.." || { echo "[ERROR] cannot cd to repo root" >&2; exit 1; }

mkdir -p "${CKPT_DIR}" || { echo "[ERROR] cannot create ${CKPT_DIR}" >&2; exit 1; }
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
# ------------------------------------------------------------------
# SIGHUP protection: when launched outside tmux/screen and not already
# re-executed, relaunch this script under nohup+setsid so the run
# survives SSH disconnects. _LAUNCH_PROTECTED prevents re-exec recursion.
# ------------------------------------------------------------------
PID_FILE="${CKPT_DIR}/train.pid"

if [[ -z "${_LAUNCH_PROTECTED:-}" ]] && [[ -z "${TMUX:-}" ]] && [[ -z "${STY:-}" ]]; then
  export _LAUNCH_PROTECTED=1
  NOHUP_LOG="${CKPT_DIR}/launch_$(date +%Y%m%d_%H%M%S).log"

  # The working directory was changed to the repo root above, so a "$0"
  # given relative to the *original* cwd may no longer resolve. Fail with
  # a clear message here instead of a cryptic error buried in the nohup log.
  if [[ ! -r "$0" ]]; then
    echo "[ERROR] cannot re-exec: script path '$0' is not readable after cd." >&2
    echo "        Launch with an absolute path or from the repo root." >&2
    exit 1
  fi

  echo "=================================================================="
  echo "  SIGHUP PROTECTION ACTIVATED"
  echo "  tmux/screen ๋ฏธ๊ฐ์ง โ ์ธ์๋ณดํธ ๋ชจ๋ ์๋ ํ์ฑํ (nohup + setsid)"
  echo "  SSH ๋์ด์ ธ๋ ํ์ต์ด ๊ณ์๋ฉ๋๋ค."
  echo "=================================================================="
  echo ""

  # Re-exec detached: setsid detaches from the controlling TTY, nohup
  # ignores SIGHUP; all output goes to the launch log.
  nohup setsid bash "$0" "$@" > "${NOHUP_LOG}" 2>&1 &
  BG_PID=$!
  echo "${BG_PID}" > "${PID_FILE}"

  # Confirm the detached child actually came up before reporting success.
  sleep 1
  if ! kill -0 "${BG_PID}" 2>/dev/null; then
    echo "[ERROR] protected relaunch died immediately — check ${NOHUP_LOG}" >&2
    exit 1
  fi

  echo "  PID : ${BG_PID}"
  echo "  PID ํ์ผ : ${PID_FILE}"
  echo "  Launch ๋ก๊ทธ : ${NOHUP_LOG}"
  echo "  ํ์ต ๋ก๊ทธ : ${LOG_FILE}"
  echo ""
  echo "  ๋ชจ๋ํฐ๋ง:"
  echo "    tail -f ${LOG_FILE}"
  echo ""
  echo "  ์ค์ง (๋น์ ์ฒดํฌํฌ์ธํธ ์๋ ์ ์ฅ):"
  echo "    kill \$(cat ${PID_FILE})"
  echo ""
  echo "  ๊ฐ์ ์ข๋ฃ:"
  echo "    kill -9 \$(cat ${PID_FILE})"
  echo "=================================================================="
  exit 0
fi
| |
|
| | |
# Handle to the optional page-cache pre-warm job started further below.
PREWARM_PID=""

# Remove the PID file and reap the pre-warm reader on every exit path.
cleanup() {
  rm -f "${PID_FILE}" 2>/dev/null || true
  [[ -n "${PREWARM_PID:-}" ]] && kill "${PREWARM_PID}" 2>/dev/null
  return 0
}
trap cleanup EXIT

# Record our own PID so operators can signal the run.
echo "$$" > "${PID_FILE}"
| |
|
| | |
# --- Preflight: required input files ---------------------------------
# Diagnostics go to stderr so they survive any stdout redirection.
if [[ ! -f "${CONFIG}" ]]; then
  echo "[ERROR] Config not found: ${CONFIG}" >&2
  exit 1
fi

if [[ ! -f "${TRAIN_DATA}" ]]; then
  echo "[ERROR] Training data not found: ${TRAIN_DATA}" >&2
  exit 1
fi
| |
|
| | |
# --- Preflight: GPU memory sanity check ------------------------------
# nvidia-smi may be absent (GPU_MEM then defaults to 0) or print something
# non-numeric; validate before the arithmetic comparison so a weird string
# cannot blow up the [[ -lt ]] test. Warn below ~80 GB (too small for 3B FP8).
GPU_MEM=$(nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits 2>/dev/null | head -1 || echo "0")
GPU_MEM="${GPU_MEM:-0}"
if [[ "$GPU_MEM" =~ ^[0-9]+$ ]] && (( GPU_MEM > 0 && GPU_MEM < 80000 )); then
  echo "[WARN] GPU memory ${GPU_MEM}MB < 80GB. 3B ํ์ต์ ๋ถ์กฑํ ์ ์์." >&2
fi
| |
|
| | |
# --- Preflight: refuse to double-launch ------------------------------
# Match the trainer command line (pgrep never matches itself); errors go
# to stderr so they are visible even when stdout is redirected.
EXISTING_PID=$(pgrep -f "pretrain.py.*korean_3b" 2>/dev/null | head -1 || true)
if [[ -n "$EXISTING_PID" ]]; then
  echo "[ERROR] ์ด๋ฏธ 3B pretrain ํ๋ก์ธ์ค ์คํ ์ค (PID: ${EXISTING_PID})" >&2
  echo " kill ${EXISTING_PID} ๋ก ๋จผ์ ์ข๋ฃํ์ธ์." >&2
  exit 1
fi
| |
|
| | |
# --- Preflight: checkpoint disk space --------------------------------
# NOTE(review): hard-codes /PROJECT as the checkpoint filesystem — confirm
# this matches where ${CKPT_DIR} actually lives.
AVAIL_KB=$(df /PROJECT 2>/dev/null | awk 'NR==2{print $4}')
if [[ "${AVAIL_KB:-}" =~ ^[0-9]+$ ]] && [[ "$AVAIL_KB" -lt 1073741824 ]]; then
  # awk does the one-decimal float division; bc is missing on minimal images.
  AVAIL_TB=$(awk -v kb="$AVAIL_KB" 'BEGIN{printf "%.1f", kb/1073741824}')
  echo "[WARN] /PROJECT ์ฌ์ ${AVAIL_TB}TB < 1TB. ์ฒดํฌํฌ์ธํธ ์ ์ฅ ๊ณต๊ฐ ๋ถ์กฑ ๊ฐ๋ฅ." >&2
fi
| |
|
| | |
# --- Auto-resume ------------------------------------------------------
# Unless the caller passed --resume explicitly, pick the newest
# checkpoint-N directory. Glob + array instead of parsing `ls` output
# (robust against unusual names); sort -V so checkpoint-10000 > checkpoint-9000.
RESUME_ARG=""
EXTRA_ARGS="${*:-}"
if [[ ! "${EXTRA_ARGS}" =~ "--resume" ]]; then   # quoted RHS => literal substring match
  LATEST_CKPT=""
  shopt -s nullglob
  CKPT_CANDIDATES=( "${CKPT_DIR}"/checkpoint-* )
  shopt -u nullglob
  if (( ${#CKPT_CANDIDATES[@]} > 0 )); then
    LATEST_CKPT=$(printf '%s\n' "${CKPT_CANDIDATES[@]}" | sort -V | tail -1)
  fi
  if [[ -n "$LATEST_CKPT" ]]; then
    echo "[INFO] ์๋ resume ๊ฐ์ง: ${LATEST_CKPT}"
    RESUME_ARG="--resume ${LATEST_CKPT}"
  fi
fi
| |
|
| | |
# Classify how this process is kept alive for the banner below.
# Priority mirrors the original override order: protected > screen > tmux.
if [[ -n "${_LAUNCH_PROTECTED:-}" ]]; then
  SESSION_TYPE="protected (nohup+setsid)"
elif [[ -n "${STY:-}" ]]; then
  SESSION_TYPE="screen"
elif [[ -n "${TMUX:-}" ]]; then
  SESSION_TYPE="tmux"
else
  SESSION_TYPE="direct"
fi

# Launch banner: one snapshot of every effective run parameter.
cat <<BANNER
==================================================================
 Korean 3B LLM Pre-Training (FP8)
 Run name : ${RUN_NAME}
 Config : ${CONFIG}
 CKPT dir : ${CKPT_DIR}
 Log file : ${LOG_FILE}
 Max steps : ${MAX_STEPS}
 Batch : ${BATCH_SIZE} (local) x ${NPROC} GPU x ${GRAD_ACCUM} accum
 Eff tokens : $((BATCH_SIZE * NPROC * GRAD_ACCUM * 4096)) tokens/step (~1M)
 Total tokens: ~$((MAX_STEPS * BATCH_SIZE * NPROC * GRAD_ACCUM * 4096 / 1000000000))B
 Resume : ${RESUME_ARG:-none (fresh start)}
 Session : ${SESSION_TYPE}
 PID : $$ (file: ${PID_FILE})
 Started : $(date)
==================================================================
BANNER
| |
|
# Silence the known-noisy torch.library UserWarning in worker output.
export PYTHONWARNINGS="ignore::UserWarning:torch.library"

# --- Page-cache pre-warm ----------------------------------------------
# Stream the training file through dd in the background so first-epoch
# reads mostly hit the page cache. Best effort: fall back to plain dd when
# numactl is missing (previously the prewarm just died silently); the job
# is reaped by cleanup() on exit.
if [[ -f "${TRAIN_DATA}" ]]; then
  echo "[INFO] Pre-warming page cache for ${TRAIN_DATA} (NUMA interleaved)..."
  if command -v numactl >/dev/null 2>&1; then
    numactl --interleave=all dd if="${TRAIN_DATA}" of=/dev/null bs=16M 2>/dev/null &
  else
    dd if="${TRAIN_DATA}" of=/dev/null bs=16M 2>/dev/null &
  fi
  PREWARM_PID=$!
fi
| |
|
| | |
| | |
| | |
| | |
| | |
| | numactl --interleave=all \ |
| | torchrun \ |
| | --nproc_per_node=${NPROC} \ |
| | --master_port=${MASTER_PORT} \ |
| | train/pretrain.py \ |
| | --config "${CONFIG}" \ |
| | --train_data "${TRAIN_DATA}" \ |
| | --val_data "${VAL_DATA}" \ |
| | --checkpoint_dir "${CKPT_DIR}" \ |
| | --log_file "${LOG_FILE}" \ |
| | --max_steps ${MAX_STEPS} \ |
| | --batch_size ${BATCH_SIZE} \ |
| | --grad_accum ${GRAD_ACCUM} \ |
| | --warmup_steps ${WARMUP_STEPS} \ |
| | --seed ${SEED} \ |
| | ${RESUME_ARG} \ |
| | ${EXTRA_ARGS} \ |
| | 2>&1 | { grep -v "UserWarning" \ |
| | | grep -v "Warning only once" \ |
| | | grep -v "Overriding a previously" \ |
| | | grep -v "dispatch key:" \ |
| | | grep -v "previous kernel:" \ |
| | | grep -v "new kernel:" \ |
| | | grep -v "operator: flash_attn" \ |
| | | grep -v "registered at /usr/local" \ |
| | | grep -v "self.m.impl" \ |
| | || true; } |
| |
|
| | EXIT_CODE=${PIPESTATUS[0]} |
| |
|
| | |
# --- Run summary -------------------------------------------------------
# Map the trainer's exit code to a human-readable status line, then
# propagate the same code to our own caller.
echo ""
echo "=================================================================="
echo " Finished : $(date)"
echo " Exit code : ${EXIT_CODE}"
case "${EXIT_CODE}" in
  0)   echo " Status : SUCCESS (ํ์ต ์๋ฃ ๋๋ graceful shutdown)" ;;
  143) echo " Status : TERMINATED (SIGTERM โ ๋น์ ์ฒดํฌํฌ์ธํธ ์ ์ฅ๋จ)" ;;
  137) echo " Status : KILLED (SIGKILL โ ๊ฐ์ ์ข๋ฃ, ์ฒดํฌํฌ์ธํธ ๋ฏธ์ ์ฅ)" ;;
  1)   echo " Status : ERROR (${LOG_FILE} ํ์ธ ํ์)" ;;
  *)   echo " Status : FAILED (exit code ${EXIT_CODE}, ${LOG_FILE} ํ์ธ)" ;;
esac
echo "=================================================================="
exit ${EXIT_CODE}
| |
|