#!/usr/bin/env bash
#
# Launch ORPO preference fine-tuning of the Korean 3B model through torchrun.
#
# Usage:
#   bash <this script> [extra arguments forwarded to train/orpo.py]
#
# Environment overrides (each has a default in the configuration section):
#   RUN_NAME, BASE_MODEL, DATA_PATH, MASTER_PORT
#
# The script changes to the parent of its own directory before it resolves any
# relative path (model, data, train/orpo.py, checkpoints/).

# Strict mode: -e aborts on an unhandled failing command, -u treats unset
# variables as errors, pipefail makes a pipeline fail when any stage fails.
set -euo pipefail
# ---- Run identity and paths (values with :- can be overridden via env) ----
RUN_NAME="${RUN_NAME:-korean_3b_orpo_v1}"
BASE_MODEL="${BASE_MODEL:-eval/outputs/hf_3b_sft_best}"
DATA_PATH="${DATA_PATH:-data/preference/combined_preference.jsonl}"
OUTPUT_DIR="checkpoints/${RUN_NAME}"
# Checkpoints and final output share one directory; derive it instead of
# repeating the literal path.
CKPT_DIR="${OUTPUT_DIR}"
LOG_FILE="${CKPT_DIR}/train.log"
NPROC=8                         # value passed to torchrun --nproc_per_node
MASTER_PORT="${MASTER_PORT:-29502}"

# ---- Training hyperparameters ----
BATCH_SIZE=4                    # per-process batch (the banner labels it "local")
GRAD_ACCUM=4                    # gradient accumulation steps
LR=1.2e-5
BETA=0.25                       # passed as --beta
EPOCHS=2
MAX_LENGTH=1536
# NOTE(review): WARMUP_RATIO and SEED are defined here but the torchrun
# invocation in this script never forwards them -- confirm whether
# train/orpo.py is expected to receive them.
WARMUP_RATIO=0.05
WEIGHT_DECAY=0.01
EVAL_SPLIT_RATIO=0.05
EVAL_STEPS=500
EARLY_STOPPING_PATIENCE=3
SAVE_TOTAL_LIMIT=5
SEED=42

# Space-joined copy of the extra command-line arguments. "$*" makes the join
# explicit (assigning "$@" to a scalar does the same thing implicitly). Word
# boundaries are lost in this string; use "$@" wherever exact arguments matter.
EXTRA_ARGS="$*"
# ---- Environment exported to the training processes ----
# NCCL settings; see the NCCL environment-variable reference for the exact
# semantics of each knob. Values are fixed here and override the caller's env.
export NCCL_IB_DISABLE=1
export NCCL_PROTO=Simple
export NCCL_MIN_NCHANNELS=16
export NCCL_MAX_NCHANNELS=16
export NCCL_BUFFSIZE=134217728   # 128 * 1024 * 1024
export NCCL_P2P_LEVEL=NVL
export NCCL_ALGO=Ring,Tree

# CPU thread caps for OpenMP / MKL.
export OMP_NUM_THREADS=9
export MKL_NUM_THREADS=9

# PyTorch CUDA caching-allocator configuration.
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

# Python warning filter (action:message:category:module): ignore UserWarning
# coming from module torch.library.
export PYTHONWARNINGS="ignore::UserWarning:torch.library"
# ---- Move to the project root and validate required inputs ----
# Every relative path in this script is resolved from the parent of the
# directory that contains it. Fail loudly if that directory is unreachable.
cd -- "$(dirname -- "$0")/.." || {
  echo "ERROR: cannot cd to the parent directory of $0" >&2
  exit 1
}

# The base model must be a directory (a converted HF-format SFT checkpoint).
if [[ ! -d "${BASE_MODEL}" ]]; then
  {
    echo "ERROR: 기반 모델 디렉토리 없음: ${BASE_MODEL}"
    echo " SFT 완료 후 HF 포맷으로 변환했는지 확인하세요."
    echo " 예: python scripts/convert_to_hf.py --checkpoint <sft_ckpt> --output ${BASE_MODEL}"
  } >&2
  exit 1
fi

# The preference dataset must be a regular file.
if [[ ! -f "${DATA_PATH}" ]]; then
  {
    echo "ERROR: 학습 데이터 없음: ${DATA_PATH}"
    echo " 먼저 데이터 통합 스크립트를 실행하세요:"
    echo " python data/prepare_preference_combined.py"
  } >&2
  exit 1
fi

# The training entry point is looked up relative to the project root.
if [[ ! -f "train/orpo.py" ]]; then
  echo "ERROR: train/orpo.py 없음" >&2
  exit 1
fi
# ---- GPU memory sanity check (warning only, never fatal) ----

# Print the total memory (as reported by nvidia-smi, in MB) of the first GPU,
# or 0 when nvidia-smi is missing, fails, or reports a non-numeric value such
# as "[N/A]". Validating here keeps the arithmetic comparison below from
# erroring on unexpected output.
first_gpu_mem_mb() {
  local smi_out first_line
  smi_out=$(nvidia-smi --query-gpu=memory.total \
    --format=csv,noheader,nounits 2>/dev/null) || smi_out=""
  first_line=${smi_out%%$'\n'*}            # keep only the first GPU's row
  first_line=${first_line//[[:space:]]/}   # drop padding / carriage returns
  if [[ "${first_line}" =~ ^[0-9]+$ ]]; then
    printf '%s\n' "${first_line}"
  else
    printf '0\n'
  fi
}

GPU_MEM=$(first_gpu_mem_mb)
# 10# forces base 10 so a value with leading zeros is not read as octal.
if (( 10#${GPU_MEM} > 0 && 10#${GPU_MEM} < 40000 )); then
  echo "WARNING: GPU 메모리 ${GPU_MEM}MB < 40GB. ORPO 3B 학습에 부족할 수 있음." >&2
fi
# ---- Refuse to start when a matching ORPO run is already active ----

# Print the first PID whose full command line matches "orpo.py ... <run name>",
# or nothing when there is no such process. The dot is escaped so that only a
# literal "orpo.py" matches. pgrep exits non-zero when nothing matches;
# "|| true" keeps that from failing the pipeline under pipefail / set -e.
find_running_orpo_pid() {
  local run_name=$1
  pgrep -f -- "orpo\.py.*${run_name}" 2>/dev/null | head -n 1 || true
}

EXISTING_PID=$(find_running_orpo_pid "${RUN_NAME}")
if [[ -n "${EXISTING_PID}" ]]; then
  {
    echo "ERROR: 이미 ORPO 프로세스 실행 중 (PID: ${EXISTING_PID})"
    echo " kill ${EXISTING_PID} 로 먼저 종료하세요."
  } >&2
  exit 1
fi
# ---- Free-space sanity check (warning only, never fatal) ----
# Filesystem to inspect; defaults to /PROJECT and can be overridden via env.
DISK_CHECK_PATH="${DISK_CHECK_PATH:-/PROJECT}"

# Print the available space of the filesystem holding $1 in 1 KiB blocks, or 0
# when df fails or prints something non-numeric. -P keeps each filesystem on a
# single line and -k pins the unit, so field 4 of line 2 is always "available".
disk_avail_kb() {
  local kb
  kb=$(df -Pk -- "$1" 2>/dev/null | awk 'NR==2 {print $4}') || kb=""
  if [[ "${kb}" =~ ^[0-9]+$ ]]; then
    printf '%s\n' "${kb}"
  else
    printf '0\n'
  fi
}

# Convert KiB ($1) to GiB with one truncated decimal, e.g. 1572864 -> 1.5.
# Pure shell arithmetic, so no external calculator is required.
kb_to_gb() {
  local kb=$1
  printf '%d.%d\n' "$(( kb / 1048576 ))" "$(( kb % 1048576 * 10 / 1048576 ))"
}

AVAIL_KB=$(disk_avail_kb "${DISK_CHECK_PATH}")
# 209715200 KiB = 200 GiB.
if (( AVAIL_KB > 0 && AVAIL_KB < 209715200 )); then
  AVAIL_GB=$(kb_to_gb "${AVAIL_KB}")
  echo "WARNING: ${DISK_CHECK_PATH} 여유 ${AVAIL_GB}GB < 200GB. 체크포인트 저장 공간 부족 가능." >&2
fi
# ---- Create output directories and compute derived values ----
mkdir -p -- "${CKPT_DIR}" "${OUTPUT_DIR}"

# Print the number of lines in file $1, or "?" when it cannot be read.
# 2>/dev/null is placed before the input redirection so a failed open stays
# silent; whitespace is stripped because some wc implementations pad the count.
count_lines() {
  local n
  if n=$(wc -l 2>/dev/null < "$1"); then
    printf '%s\n' "${n//[[:space:]]/}"
  else
    printf '?\n'
  fi
}

DATA_LINES=$(count_lines "${DATA_PATH}")
echo " 학습 데이터 레코드 수: ${DATA_LINES}"

# Effective global batch = per-process batch x processes x accumulation steps.
EFF_BATCH=$(( BATCH_SIZE * NPROC * GRAD_ACCUM ))
# ---- Startup banner ----

# Print the resolved run configuration to stdout. Reads the configuration
# variables set earlier in this script (RUN_NAME, BASE_MODEL, DATA_PATH, ...).
print_run_banner() {
  local rule="=================================================================="
  cat <<EOF
${rule}
 Korean 3B LLM ORPO Fine-Tuning
 Run name : ${RUN_NAME}
 Base model : ${BASE_MODEL}
 Data : ${DATA_PATH} (${DATA_LINES} records)
 Output dir : ${OUTPUT_DIR}
 CKPT dir : ${CKPT_DIR}
 Log file : ${LOG_FILE}
 Epochs : ${EPOCHS}
 LR : ${LR}
 Beta (ORPO) : ${BETA}
 Batch : ${BATCH_SIZE} (local) × ${NPROC} GPU × ${GRAD_ACCUM} accum = ${EFF_BATCH}
 Max length : ${MAX_LENGTH}
 Weight decay : ${WEIGHT_DECAY}
 Eval steps : ${EVAL_STEPS}
 Early stop : patience=${EARLY_STOPPING_PATIENCE}
 Started : $(date)
${rule}
EOF
}

print_run_banner
# ---- Launch training ----

# Remove known-noisy warning lines from the console stream (the log file is
# written before this filter and stays complete). One grep with several -e
# patterns drops a line when it matches ANY of them, which is what a chain of
# `grep -v` does. --line-buffered keeps output flowing line by line instead of
# in large blocks while the job runs.
filter_console_noise() {
  grep --line-buffered -v \
    -e "UserWarning" \
    -e "Warning only once" \
    -e "Overriding a previously" \
    -e "dispatch key:" \
    -e "previous kernel:" \
    -e "new kernel:" \
    -e "operator: flash_attn" \
    -e "registered at /usr/local" \
    -e "self.m.impl"
}

# Run the command given as arguments with stderr merged into stdout, copy the
# full output to LOG_FILE, and print the filtered stream.
# Returns the exit status of the command itself, not of tee/grep: grep exits 1
# when it selects no lines, which must not be reported as a training failure.
# Call this on the left of `||` (or inside `if`) so `set -e` does not abort the
# script before the status can be recorded.
run_logged() {
  local -a stage_status
  "$@" 2>&1 \
    | tee "${LOG_FILE}" \
    | filter_console_noise
  stage_status=("${PIPESTATUS[@]}")
  return "${stage_status[0]}"
}

# Extra command-line arguments are forwarded as "$@" so arguments containing
# spaces or glob characters reach train/orpo.py unchanged.
EXIT_CODE=0
run_logged torchrun \
  --nproc_per_node="${NPROC}" \
  --master_port="${MASTER_PORT}" \
  train/orpo.py \
  --model_path "${BASE_MODEL}" \
  --custom_data_path "${DATA_PATH}" \
  --output_dir "${OUTPUT_DIR}" \
  --epochs "${EPOCHS}" \
  --lr "${LR}" \
  --beta "${BETA}" \
  --batch_size "${BATCH_SIZE}" \
  --gradient_accumulation_steps "${GRAD_ACCUM}" \
  --max_length "${MAX_LENGTH}" \
  --weight_decay "${WEIGHT_DECAY}" \
  --eval_split_ratio "${EVAL_SPLIT_RATIO}" \
  --eval_steps "${EVAL_STEPS}" \
  --early_stopping_patience "${EARLY_STOPPING_PATIENCE}" \
  --save_total_limit "${SAVE_TOTAL_LIMIT}" \
  "$@" \
  || EXIT_CODE=$?
# ---- Final summary; the script exits with the recorded training status ----
echo "=================================================================="
echo " Done : $(date)"
echo " Exit code: ${EXIT_CODE}"
# The saved-model location is only reported for a successful run.
if (( EXIT_CODE == 0 )); then
  echo " 모델 저장 위치: ${OUTPUT_DIR}"
fi
echo "=================================================================="
exit "${EXIT_CODE}"