# frankenstallm / source/scripts/launch_3b_orpo.sh
# Uploaded by pathcosmos via huggingface_hub (#17), commit 48ecd01
#!/usr/bin/env bash
# =============================================================================
# launch_3b_orpo.sh — 8-GPU ORPO fine-tuning launcher for a Korean 3B LLM
#
# Usage:
#   bash scripts/launch_3b_orpo.sh                    # default run
#   bash scripts/launch_3b_orpo.sh --max_steps 200    # quick smoke test
#   RUN_NAME=my_orpo bash scripts/launch_3b_orpo.sh   # custom run name
#
# Base model : eval/outputs/hf_3b_sft_best (SFT v1 best)
# Data       : data/preference/combined_preference.jsonl
# Output     : checkpoints/korean_3b_orpo_v1/
# Log        : checkpoints/korean_3b_orpo_v1/train.log
#
# Expected checkpoint size:
#   model weights:    ~6GB (bf16)
#   optimizer states: ~24GB
#   total ~30GB each x max 5 kept = 150GB
# =============================================================================
set -euo pipefail

# ---- Configurable defaults --------------------------------------------------
# Every value below may be overridden from the environment, e.g.:
#   BATCH_SIZE=2 LR=1e-5 bash scripts/launch_3b_orpo.sh
RUN_NAME="${RUN_NAME:-korean_3b_orpo_v1}"
BASE_MODEL="${BASE_MODEL:-eval/outputs/hf_3b_sft_best}"
DATA_PATH="${DATA_PATH:-data/preference/combined_preference.jsonl}"
OUTPUT_DIR="checkpoints/${RUN_NAME}"
CKPT_DIR="checkpoints/${RUN_NAME}"
LOG_FILE="${CKPT_DIR}/train.log"
NPROC="${NPROC:-8}"                  # GPUs / torchrun workers per node
MASTER_PORT="${MASTER_PORT:-29502}"

# ORPO hyperparameters (env-overridable; defaults match the original run)
BATCH_SIZE="${BATCH_SIZE:-4}"        # per-GPU micro-batch
GRAD_ACCUM="${GRAD_ACCUM:-4}"        # gradient accumulation steps
LR="${LR:-1.2e-5}"
BETA="${BETA:-0.25}"                 # ORPO odds-ratio loss weight
EPOCHS="${EPOCHS:-2}"
MAX_LENGTH="${MAX_LENGTH:-1536}"
WARMUP_RATIO="${WARMUP_RATIO:-0.05}"
WEIGHT_DECAY="${WEIGHT_DECAY:-0.01}"
EVAL_SPLIT_RATIO="${EVAL_SPLIT_RATIO:-0.05}"
EVAL_STEPS="${EVAL_STEPS:-500}"
EARLY_STOPPING_PATIENCE="${EARLY_STOPPING_PATIENCE:-3}"
SAVE_TOTAL_LIMIT="${SAVE_TOTAL_LIMIT:-5}"
SEED="${SEED:-42}"

# NOTE(review): assigning "$@" to a scalar flattens all script arguments into
# one space-joined string (shellcheck SC2124), so extra args that contain
# spaces are not preserved as single words when this is expanded unquoted.
EXTRA_ARGS="$@"
# ---- B200 / NVSwitch single-node NCCL tuning --------------------------------
# (identical NCCL settings to launch_3b_pretrain.sh)
NCCL_IB_DISABLE=1
NCCL_PROTO=Simple
NCCL_MIN_NCHANNELS=16
NCCL_MAX_NCHANNELS=16
# ORPO forward/backward passes swing memory harder than pretraining, so the
# 128 MB communication buffer is kept.
NCCL_BUFFSIZE=134217728
export NCCL_IB_DISABLE NCCL_PROTO NCCL_MIN_NCHANNELS NCCL_MAX_NCHANNELS NCCL_BUFFSIZE

# CPU-side thread caps for the data/loss pipeline.
OMP_NUM_THREADS=9
MKL_NUM_THREADS=9
export OMP_NUM_THREADS MKL_NUM_THREADS

# OOM mitigation: ORPO forwards chosen and rejected together, so it is
# memory-sensitive; relax CUDA allocator fragmentation.
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
# Enable direct P2P traffic over NVLink.
export NCCL_P2P_LEVEL=NVL
# Run Ring and Tree algorithms in parallel (sized for 3B gradients).
export NCCL_ALGO=Ring,Tree
export PYTHONWARNINGS="ignore::UserWarning:torch.library"

# Work from the repository root (parent of this script's directory).
cd "$(dirname "$0")/.."
# ---- Pre-flight checks ------------------------------------------------------
# Fail fast — before torchrun spins up 8 workers — if the base model, the
# preference data, or the trainer script is missing. Diagnostics go to stderr
# so they are not swallowed when stdout is piped/filtered.
if [[ ! -d "${BASE_MODEL}" ]]; then
  echo "ERROR: ๊ธฐ๋ฐ˜ ๋ชจ๋ธ ๋””๋ ‰ํ† ๋ฆฌ ์—†์Œ: ${BASE_MODEL}" >&2
  echo " SFT ์™„๋ฃŒ ํ›„ HF ํฌ๋งท์œผ๋กœ ๋ณ€ํ™˜ํ–ˆ๋Š”์ง€ ํ™•์ธํ•˜์„ธ์š”." >&2
  echo " ์˜ˆ: python scripts/convert_to_hf.py --checkpoint <sft_ckpt> --output ${BASE_MODEL}" >&2
  exit 1
fi
if [[ ! -f "${DATA_PATH}" ]]; then
  echo "ERROR: ํ•™์Šต ๋ฐ์ดํ„ฐ ์—†์Œ: ${DATA_PATH}" >&2
  echo " ๋จผ์ € ๋ฐ์ดํ„ฐ ํ†ตํ•ฉ ์Šคํฌ๋ฆฝํŠธ๋ฅผ ์‹คํ–‰ํ•˜์„ธ์š”:" >&2
  echo " python data/prepare_preference_combined.py" >&2
  exit 1
fi
if [[ ! -f "train/orpo.py" ]]; then
  echo "ERROR: train/orpo.py ์—†์Œ" >&2
  exit 1
fi
# GPU memory check (warn only; nvidia-smi may be absent or report oddly)
GPU_MEM=$(nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits 2>/dev/null | head -1 || echo "0")
# Guard against non-numeric output (e.g. "[N/A]"): a bare
# [[ "$GPU_MEM" -gt 0 ]] raises an arithmetic error on such input and would
# abort the whole script under `set -e`.
if [[ "$GPU_MEM" =~ ^[0-9]+$ ]] && (( GPU_MEM > 0 && GPU_MEM < 40000 )); then
  echo "WARNING: GPU ๋ฉ”๋ชจ๋ฆฌ ${GPU_MEM}MB < 40GB. ORPO 3B ํ•™์Šต์— ๋ถ€์กฑํ•  ์ˆ˜ ์žˆ์Œ." >&2
fi
# Refuse to start when an ORPO run with the same name is already in flight.
EXISTING_PID=$(pgrep -f "orpo.py.*${RUN_NAME}" 2>/dev/null | head -1 || true)
if [[ -n "$EXISTING_PID" ]]; then
  echo "ERROR: ์ด๋ฏธ ORPO ํ”„๋กœ์„ธ์Šค ์‹คํ–‰ ์ค‘ (PID: ${EXISTING_PID})" >&2
  echo " kill ${EXISTING_PID} ๋กœ ๋จผ์ € ์ข…๋ฃŒํ•˜์„ธ์š”." >&2
  exit 1
fi
# Disk headroom check (warn below 200GB free on /PROJECT, where checkpoints go;
# ~30GB per checkpoint x up to 5 kept).
AVAIL_KB=$(df /PROJECT 2>/dev/null | awk 'NR==2{print $4}' || echo "0")
if [[ -n "$AVAIL_KB" && "$AVAIL_KB" -gt 0 && "$AVAIL_KB" -lt 209715200 ]]; then
  # bc may be missing on minimal images; fall back to "?" rather than failing.
  AVAIL_GB=$(echo "scale=1; $AVAIL_KB / 1048576" | bc 2>/dev/null || echo "?")
  echo "WARNING: /PROJECT ์—ฌ์œ  ${AVAIL_GB}GB < 200GB. ์ฒดํฌํฌ์ธํŠธ ์ €์žฅ ๊ณต๊ฐ„ ๋ถ€์กฑ ๊ฐ€๋Šฅ." >&2
fi
mkdir -p "${CKPT_DIR}" "${OUTPUT_DIR}"
# ---- Record count of the training data ---------------------------------------
DATA_LINES=$(wc -l < "${DATA_PATH}" 2>/dev/null || echo "?")
echo " ํ•™์Šต ๋ฐ์ดํ„ฐ ๋ ˆ์ฝ”๋“œ ์ˆ˜: ${DATA_LINES}"
# ---- Effective batch size ----------------------------------------------------
# effective batch = per-GPU micro-batch x world size x accumulation steps
EFF_BATCH=$((BATCH_SIZE * NPROC * GRAD_ACCUM))
# Launch banner: summarize the full run configuration before starting.
cat <<BANNER
==================================================================
 Korean 3B LLM ORPO Fine-Tuning
 Run name : ${RUN_NAME}
 Base model : ${BASE_MODEL}
 Data : ${DATA_PATH} (${DATA_LINES} records)
 Output dir : ${OUTPUT_DIR}
 CKPT dir : ${CKPT_DIR}
 Log file : ${LOG_FILE}
 Epochs : ${EPOCHS}
 LR : ${LR}
 Beta (ORPO) : ${BETA}
 Batch : ${BATCH_SIZE} (local) ร— ${NPROC} GPU ร— ${GRAD_ACCUM} accum = ${EFF_BATCH}
 Max length : ${MAX_LENGTH}
 Weight decay : ${WEIGHT_DECAY}
 Eval steps : ${EVAL_STEPS}
 Early stop : patience=${EARLY_STOPPING_PATIENCE}
 Started : $(date)
==================================================================
BANNER
# ---- Launch -------------------------------------------------------------------
# The complete, unfiltered log always lands in ${LOG_FILE} via tee; the
# grep -v chain only trims known-noisy torch.library / flash_attn warnings
# from the console.
#
# Drop `set -e` for the duration of the pipeline: under `-e -o pipefail` a
# training failure (or a grep that filters every line and thus exits 1) would
# abort the script here, so the footer and exit-code report below would never
# run and EXIT_CODE could never be captured. torchrun's own status is read
# from PIPESTATUS[0] — `$?` after the pipeline is the status of the LAST grep.
set +e
torchrun \
  --nproc_per_node="${NPROC}" \
  --master_port="${MASTER_PORT}" \
  train/orpo.py \
  --model_path "${BASE_MODEL}" \
  --custom_data_path "${DATA_PATH}" \
  --output_dir "${OUTPUT_DIR}" \
  --epochs "${EPOCHS}" \
  --lr "${LR}" \
  --beta "${BETA}" \
  --batch_size "${BATCH_SIZE}" \
  --gradient_accumulation_steps "${GRAD_ACCUM}" \
  --max_length "${MAX_LENGTH}" \
  --weight_decay "${WEIGHT_DECAY}" \
  --eval_split_ratio "${EVAL_SPLIT_RATIO}" \
  --eval_steps "${EVAL_STEPS}" \
  --early_stopping_patience "${EARLY_STOPPING_PATIENCE}" \
  --save_total_limit "${SAVE_TOTAL_LIMIT}" \
  "$@" \
  2>&1 | tee "${LOG_FILE}" \
  | grep -v "UserWarning" \
  | grep -v "Warning only once" \
  | grep -v "Overriding a previously" \
  | grep -v "dispatch key:" \
  | grep -v "previous kernel:" \
  | grep -v "new kernel:" \
  | grep -v "operator: flash_attn" \
  | grep -v "registered at /usr/local" \
  | grep -v "self.m.impl"
EXIT_CODE=${PIPESTATUS[0]}
set -e
echo "=================================================================="
echo " Done : $(date)"
echo " Exit code: ${EXIT_CODE}"
if [[ "${EXIT_CODE}" -eq 0 ]]; then
  echo " ๋ชจ๋ธ ์ €์žฅ ์œ„์น˜: ${OUTPUT_DIR}"
fi
echo "=================================================================="
exit "$EXIT_CODE"