Upload folder using huggingface_hub
#17
by somebody-to-love - opened
- source/scripts/RESTART_GUIDE.md +23 -0
- source/scripts/apply_optimizations.sh +194 -0
- source/scripts/build_3b_dataset.sh +83 -0
- source/scripts/check_korean_data.sh +178 -0
- source/scripts/clickhouse-watchdog.sh +201 -0
- source/scripts/convert_3b_gguf.sh +229 -0
- source/scripts/convert_to_gguf.sh +92 -0
- source/scripts/convert_to_hf.py +262 -0
- source/scripts/deploy_3b_ollama.sh +146 -0
- source/scripts/deploy_ollama.sh +118 -0
- source/scripts/fix_tokenizer_byte_fallback.py +235 -0
- source/scripts/hourly_status.sh +241 -0
- source/scripts/launch_3b_orpo.sh +177 -0
- source/scripts/launch_3b_pretrain.sh +258 -0
- source/scripts/launch_3b_sft.sh +145 -0
- source/scripts/launch_3b_sft_v2.sh +156 -0
- source/scripts/launch_fp8.sh +94 -0
- source/scripts/launch_hybrid_3b.sh +62 -0
- source/scripts/launch_korean_1b.sh +133 -0
- source/scripts/launch_korean_3b.sh +115 -0
- source/scripts/launch_sft.sh +111 -0
- source/scripts/migrate_qkv_checkpoint.py +230 -0
- source/scripts/monitor_3b.sh +316 -0
- source/scripts/monitor_training.sh +244 -0
- source/scripts/openclaw-watchdog.sh +243 -0
- source/scripts/orpo_eval_watchdog.sh +127 -0
- source/scripts/orpo_hp_sweep.sh +166 -0
- source/scripts/prepare_3b_data.sh +414 -0
- source/scripts/prepare_sft_combined.sh +264 -0
- source/scripts/quality_gate.sh +518 -0
- source/scripts/run_eval.sh +23 -0
- source/scripts/run_eval_full.sh +236 -0
- source/scripts/run_eval_quick.sh +150 -0
- source/scripts/run_pretrain.sh +26 -0
- source/scripts/start-gateway.sh +44 -0
- source/scripts/telegram_notify.py +168 -0
- source/scripts/test_ollama_repetition.py +148 -0
- source/scripts/training_watchdog.sh +292 -0
- source/scripts/upload_to_huggingface.py +182 -0
source/scripts/RESTART_GUIDE.md
ADDED
@@ -0,0 +1,23 @@
# FRANKENSTALLM 3B — Optimization Restart Guide

## Quick restart (all optimizations applied automatically):
```bash
bash scripts/apply_optimizations.sh
```

## Validate only (no restart):
```bash
bash scripts/apply_optimizations.sh --test-only
```

## Manual steps if auto-migration fails:
1. Stop: `kill $(cat checkpoints/korean_3b_fp8_run1/train.pid)`
2. Migrate: `python3 scripts/migrate_qkv_checkpoint.py checkpoints/korean_3b_fp8_run1/checkpoint-XXXXX`
3. Restart: `bash scripts/launch_3b_pretrain.sh`

## Rollback (undo QKV fusion):
```bash
CKPT=checkpoints/korean_3b_fp8_run1/checkpoint-XXXXX
cp ${CKPT}/model.pt.backup_pre_qkv ${CKPT}/model.pt
git checkout model/attention.py  # restore original attention code
```
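Note: `scripts/migrate_qkv_checkpoint.py` is part of this upload but its body is not shown above. For orientation, here is a minimal sketch of the core fusion step it presumably performs, using the per-layer key layout that `scripts/convert_to_hf.py` (shown later in this diff) expects: separate `layers.{i}.attn.{q,k,v}_proj.weight` tensors concatenated row-wise into one `layers.{i}.attn.qkv_proj.weight`. This is an illustrative sketch, not the actual migration script.

```python
# Sketch of the QKV fusion migration (illustrative; the real script also
# handles backups and optimizer state). Key names follow convert_to_hf.py.
import torch

def fuse_qkv(state_dict: dict, n_layers: int) -> dict:
    out = dict(state_dict)
    for i in range(n_layers):
        pfx = f"layers.{i}.attn"
        q = out.pop(f"{pfx}.q_proj.weight")
        k = out.pop(f"{pfx}.k_proj.weight")
        v = out.pop(f"{pfx}.v_proj.weight")
        # Row-wise concatenation: three projections become a single GEMM.
        out[f"{pfx}.qkv_proj.weight"] = torch.cat([q, k, v], dim=0)
    return out
```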
source/scripts/apply_optimizations.sh
ADDED
@@ -0,0 +1,194 @@
#!/usr/bin/env bash
# =============================================================================
# apply_optimizations.sh — Apply v2 optimizations and restart training
#
# Optimizations applied:
#   1. QKV Projection Fusion (+8-12% throughput)
#   2. NUMA CPU Affinity (fix 69% cross-NUMA workers)
#   3. Batch size 4→5 (11h saved over full run)
#   4. NCCL NVLS algorithm + 256MB buffers
#   5. DDP bucket_cap_mb 400→800
#   6. DataLoader num_workers 4→6, prefetch_factor 3→4
#   7. MADV_RANDOM + WILLNEED for PackedDataset
#   8. numactl --interleave=all on torchrun
#
# Usage:
#   bash scripts/apply_optimizations.sh              # full migration
#   bash scripts/apply_optimizations.sh --test-only  # just validate, don't restart
#   bash scripts/apply_optimizations.sh --skip-stop  # don't stop current training
# =============================================================================
set -u

cd "$(dirname "$0")/.."

RUN_NAME="korean_3b_fp8_run1"
CKPT_DIR="checkpoints/${RUN_NAME}"
PID_FILE="${CKPT_DIR}/train.pid"
LOG_FILE="${CKPT_DIR}/train.log"

TEST_ONLY=false
SKIP_STOP=false
for arg in "$@"; do
  case "$arg" in
    --test-only) TEST_ONLY=true ;;
    --skip-stop) SKIP_STOP=true ;;
  esac
done

echo "=================================================================="
echo "  FRANKENSTALLM 3B — Optimization Migration v2"
echo "  $(date)"
echo "=================================================================="

# ---- Step 1: Validate all modified files --------------------------------
echo ""
echo "[1/6] Validating modified files..."
ERRORS=0

for pyfile in model/attention.py train/pretrain.py data/dataset.py scripts/migrate_qkv_checkpoint.py; do
  if python3 -c "import ast; ast.parse(open('$pyfile').read())" 2>/dev/null; then
    echo "  ✓ $pyfile — syntax OK"
  else
    echo "  ✗ $pyfile — SYNTAX ERROR"
    ((ERRORS++))
  fi
done

if bash -n scripts/launch_3b_pretrain.sh 2>/dev/null; then
  echo "  ✓ scripts/launch_3b_pretrain.sh — syntax OK"
else
  echo "  ✗ scripts/launch_3b_pretrain.sh — SYNTAX ERROR"
  ((ERRORS++))
fi

# Check YAML
python3 -c "
import yaml
with open('configs/korean_3b_fp8.yaml') as f:
    cfg = yaml.safe_load(f)
assert cfg['train']['batch_size'] == 5, f'batch_size should be 5, got {cfg[\"train\"][\"batch_size\"]}'
print('  ✓ configs/korean_3b_fp8.yaml — valid, batch_size=5')
" 2>/dev/null || { echo "  ✗ configs/korean_3b_fp8.yaml — INVALID"; ((ERRORS++)); }

if [[ $ERRORS -gt 0 ]]; then
  echo ""
  echo "[ERROR] $ERRORS file(s) failed validation. Aborting."
  exit 1
fi
echo "  All files validated successfully."

if $TEST_ONLY; then
  echo ""
  echo "[INFO] --test-only mode. Exiting without restart."
  exit 0
fi

# ---- Step 2: Stop current training (graceful) ---------------------------
if ! $SKIP_STOP; then
  echo ""
  echo "[2/6] Stopping current training (SIGTERM → emergency checkpoint)..."
  if [[ -f "$PID_FILE" ]]; then
    PID=$(cat "$PID_FILE")
    if kill -0 "$PID" 2>/dev/null; then
      echo "  Sending SIGTERM to PID $PID..."
      kill "$PID"
      echo "  Waiting for graceful shutdown (up to 120s)..."
      for i in $(seq 1 120); do
        if ! kill -0 "$PID" 2>/dev/null; then
          echo "  Process stopped after ${i}s"
          break
        fi
        sleep 1
      done
      if kill -0 "$PID" 2>/dev/null; then
        echo "  [WARN] Process still running after 120s. Force killing..."
        kill -9 "$PID" 2>/dev/null || true
        sleep 2
      fi
    else
      echo "  Process $PID not running."
    fi
  else
    echo "  No PID file found."
  fi

  # Wait for all GPU processes to clear
  echo "  Waiting for GPU processes to terminate..."
  for i in $(seq 1 30); do
    if ! pgrep -f "pretrain.py.*korean_3b" >/dev/null 2>&1; then
      echo "  All GPU processes cleared."
      break
    fi
    sleep 1
  done
fi

# ---- Step 3: Find and migrate latest checkpoint -------------------------
echo ""
echo "[3/6] Migrating latest checkpoint (QKV fusion)..."
LATEST_CKPT=$(ls -d "${CKPT_DIR}"/checkpoint-* 2>/dev/null | sort -V | tail -1)
if [[ -z "$LATEST_CKPT" ]]; then
  echo "  [ERROR] No checkpoint found!"
  exit 1
fi
echo "  Latest checkpoint: $LATEST_CKPT"

# Backup original model.pt
cp "${LATEST_CKPT}/model.pt" "${LATEST_CKPT}/model.pt.backup_pre_qkv"
echo "  Backup created: model.pt.backup_pre_qkv"

# Run migration
python3 scripts/migrate_qkv_checkpoint.py "$LATEST_CKPT"
echo "  QKV fusion migration complete."

# ---- Step 4: Quick validation test (5 steps) ----------------------------
echo ""
echo "[4/6] Running 5-step validation test..."
# Use single GPU for fast test
timeout 120 python3 train/pretrain.py \
  --config configs/korean_3b_fp8.yaml \
  --train_data data/3b_train.bin \
  --checkpoint_dir /tmp/frankenstallm_test \
  --max_steps 5 \
  --batch_size 5 \
  --resume "$LATEST_CKPT" \
  2>&1 | tail -10

# NOTE: with the "| tail -10" pipeline, plain $? reports tail's exit status,
# not the trainer's. PIPESTATUS[0] captures the training process itself.
TEST_EXIT=${PIPESTATUS[0]}
if [[ $TEST_EXIT -eq 0 ]]; then
  echo "  ✓ 5-step test passed!"
else
  echo "  ✗ 5-step test FAILED (exit code $TEST_EXIT)"
  echo "  [WARN] Restoring original checkpoint..."
  cp "${LATEST_CKPT}/model.pt.backup_pre_qkv" "${LATEST_CKPT}/model.pt"
  echo "  Original checkpoint restored. Aborting."
  exit 1
fi

# ---- Step 5: Clean up test artifacts ------------------------------------
echo ""
echo "[5/6] Cleaning up test artifacts..."
rm -rf /tmp/frankenstallm_test

# ---- Step 6: Launch full training with optimizations --------------------
echo ""
echo "[6/6] Launching optimized training..."
echo ""
echo "  Changes applied:"
echo "    • QKV Projection Fusion (single GEMM)"
echo "    • NUMA CPU Affinity (cores 0-35→GPU0-3, 36-71→GPU4-7)"
echo "    • Batch size: 4 → 5"
echo "    • NCCL: NVLS,Ring algorithm, 256MB buffers"
echo "    • DDP: bucket_cap_mb 400 → 800"
echo "    • DataLoader: 4→6 workers, prefetch 3→4"
echo "    • MADV_RANDOM + WILLNEED for dataset mmap"
echo "    • numactl --interleave=all on torchrun"
echo ""

bash scripts/launch_3b_pretrain.sh

echo ""
echo "=================================================================="
echo "  Migration complete! Monitor with:"
echo "    tail -f ${LOG_FILE}"
echo "=================================================================="
source/scripts/build_3b_dataset.sh
ADDED
@@ -0,0 +1,83 @@
#!/usr/bin/env bash
set -euo pipefail
cd "$(dirname "$0")/.."
DATA="data"

echo "=================================================================="
echo "  3B combined dataset build | started: $(date)"
echo "=================================================================="

# Merge chunked .bin files back into a single file
merge_chunks() {
  PREFIX="$1"
  OUTPUT="$2"
  CHUNKS=$(ls "${PREFIX}".bin.chunk* 2>/dev/null | sort || true)
  if [[ -z "$CHUNKS" ]]; then return; fi
  if [[ -f "$OUTPUT" ]]; then echo "  [SKIP] $OUTPUT already exists"; return; fi
  echo "  Merging chunks: $(basename "$PREFIX")"
  cat $CHUNKS > "$OUTPUT"
  echo "  Done: $(du -sh "$OUTPUT" | cut -f1)"
}

merge_chunks "$DATA/cosmo_auto_math_text_train" "$DATA/cosmo_auto_math_text_train.bin"
merge_chunks "$DATA/cosmo_auto_math_text_val" "$DATA/cosmo_auto_math_text_val.bin"
merge_chunks "$DATA/cosmo_web_v2_train" "$DATA/cosmo_web_v2_train.bin"
merge_chunks "$DATA/cosmo_web_v2_val" "$DATA/cosmo_web_v2_val.bin"

TRAIN_FILES=""
for f in \
  "$DATA/korean_train.bin" \
  "$DATA/hplt_ko_train.bin" \
  "$DATA/korean_c4_train.bin" \
  "$DATA/cc100_ko_train.bin" \
  "$DATA/namuwiki_2023b_train.bin" \
  "$DATA/korean_namuwiki_train.bin" \
  "$DATA/wikipedia_ko_train.bin" \
  "$DATA/korean_wiki_train.bin" \
  "$DATA/open_web_math_train.bin" \
  "$DATA/mathpile_train.bin" \
  "$DATA/cosmo_auto_math_text_train.bin" \
  "$DATA/cosmo_stories_train.bin" \
  "$DATA/cosmo_web_v2_train.bin" \
  "$DATA/cosmo_stanford_train.bin" \
  "$DATA/cosmo_wikihow_train.bin" \
  "$DATA/cosmo_openstax_train.bin" \
  "$DATA/cosmo_khanacademy_train.bin"; do
  [[ -f "$f" ]] && TRAIN_FILES="$TRAIN_FILES $f"
done

VAL_FILES=""
for f in \
  "$DATA/korean_val.bin" \
  "$DATA/hplt_ko_val.bin" \
  "$DATA/korean_c4_val.bin" \
  "$DATA/cc100_ko_val.bin" \
  "$DATA/namuwiki_2023b_val.bin" \
  "$DATA/open_web_math_val.bin" \
  "$DATA/mathpile_val.bin" \
  "$DATA/cosmo_auto_math_text_val.bin" \
  "$DATA/cosmo_stories_val.bin" \
  "$DATA/cosmo_web_v2_val.bin"; do
  [[ -f "$f" ]] && VAL_FILES="$VAL_FILES $f"
done

echo ""
echo "Merging train files → data/3b_train.bin ..."
python3 data/merge_bins.py $TRAIN_FILES data/3b_train.bin

echo ""
echo "Merging val files → data/3b_val.bin ..."
python3 data/merge_bins.py $VAL_FILES data/3b_val.bin

echo ""
echo "=================================================================="
du -sh data/3b_train.bin data/3b_val.bin
python3 -c "
import os
sz = os.path.getsize('data/3b_train.bin')
tok = sz // 2
print(f'3b_train: {tok/1e9:.2f}B tokens')
print(f'Epochs to reach 60B tokens: {60/(tok/1e9):.1f}x repeats needed')
"
echo "Done: $(date)"
echo "=================================================================="
source/scripts/check_korean_data.sh
ADDED
@@ -0,0 +1,178 @@
#!/bin/bash

# Korean training data status check script
# Purpose: inspect the Korean dataset state, tokenizer, and raw data files

set -e

# Project root (relative to this script's location)
PROJECT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
cd "${PROJECT_ROOT}"

echo "=== Korean training data status ==="
echo ""

# ============================================================================
# 1. Training binary data
# ============================================================================
echo "[ Training binary data ]"

check_binary_data() {
  local file=$1
  local name=$2

  if [ -f "$file" ]; then
    local size=$(du -h "$file" | cut -f1)

    # Count tokens via Python + numpy memmap.
    # The binaries are stored as uint32 (4 bytes per token).
    local token_count=$(python3 -c "
import numpy as np
try:
    data = np.memmap('$file', dtype=np.uint32, mode='r')
    print(len(data))
except Exception as e:
    print('error')
" 2>/dev/null || echo "error")

    if [ "$token_count" != "error" ] && [ ! -z "$token_count" ]; then
      # 1B-model training step estimate
      # tokens_per_step = batch_size * grad_accum * seq_len * num_gpus
      #                 = 8 * 4 * 4096 * 8 = 1,048,576 tokens/step
      local tokens_per_step=1048576
      local estimated_steps=$((token_count / tokens_per_step))

      printf "  %-20s : present (%s, %'d tokens, ~%'d steps)\n" \
        "$name" "$size" "$token_count" "$estimated_steps"
    else
      printf "  %-20s : present (%s, token count failed)\n" "$name" "$size"
    fi
  else
    printf "  %-20s : missing\n" "$name"
  fi
}

check_binary_data "data/korean_train.bin" "korean_train.bin"
check_binary_data "data/korean_val.bin" "korean_val.bin"
check_binary_data "data/train.bin" "train.bin"
check_binary_data "data/val.bin" "val.bin"

echo ""

# ============================================================================
# 2. Tokenizer
# ============================================================================
echo "[ Tokenizer ]"

check_tokenizer() {
  local dir=$1
  local name=$2

  if [ -d "$dir" ]; then
    local files=$(find "$dir" -type f | wc -l)
    printf "  %-20s : present (%d files)\n" "$name" "$files"
  else
    printf "  %-20s : missing\n" "$name"
  fi
}

check_tokenizer "tokenizer/korean_sp" "korean_sp"
check_tokenizer "tokenizer" "default tokenizer"

echo ""

# ============================================================================
# 3. Raw data directories
# ============================================================================
echo "[ Raw data ]"

check_raw_data() {
  local dir=$1
  local name=$2

  if [ -d "$dir" ]; then
    local file_count=$(find "$dir" -maxdepth 1 -type f | wc -l)
    local total_size=$(du -sh "$dir" 2>/dev/null | cut -f1)

    if [ $file_count -eq 0 ]; then
      printf "  %-20s : missing (directory only, 0 files)\n" "$name"
    else
      printf "  %-20s : %'d files (%s)\n" "$name" "$file_count" "$total_size"
    fi
  else
    printf "  %-20s : missing\n" "$name"
  fi
}

check_raw_data "data/raw/cc100_ko" "cc100_ko/"
check_raw_data "data/raw/c4_ko" "c4_ko/"
check_raw_data "data/raw/namuwiki_ko" "namuwiki_ko/"

# Wiki data lives directly under raw/
echo ""
echo "[ Wikipedia data ]"
ko_wiki_count=$(find "data/raw" -maxdepth 1 -name "ko_wiki_*.txt" | wc -l)
en_wiki_count=$(find "data/raw" -maxdepth 1 -name "en_wiki_*.txt" | wc -l)
ko_wiki_size=$(du -sh "data/raw" 2>/dev/null | cut -f1)

if [ $ko_wiki_count -gt 0 ]; then
  printf "  %-20s : %'d files\n" "ko_wiki" "$ko_wiki_count"
fi

if [ $en_wiki_count -gt 0 ]; then
  printf "  %-20s : %'d files\n" "en_wiki" "$en_wiki_count"
fi

echo ""

# ============================================================================
# 4. Overall status summary
# ============================================================================
echo "[ Overall status ]"

# Training binaries
binary_ready=false
if [ -f "data/korean_train.bin" ] && [ -f "data/korean_val.bin" ]; then
  binary_ready=true
elif [ -f "data/train.bin" ] && [ -f "data/val.bin" ]; then
  binary_ready=true
fi

# Tokenizer
tokenizer_ready=false
if [ -d "tokenizer/korean_sp" ] && [ -f "tokenizer/korean_sp/tokenizer.model" ]; then
  tokenizer_ready=true
fi

# Raw data
raw_ready=false
if [ -d "data/raw/c4_ko" ] || [ -d "data/raw/namuwiki_ko" ] || [ -d "data/raw/cc100_ko" ]; then
  count=$(find "data/raw/c4_ko" -maxdepth 1 -type f 2>/dev/null | wc -l)
  count=$((count + $(find "data/raw/namuwiki_ko" -maxdepth 1 -type f 2>/dev/null | wc -l)))
  count=$((count + $(find "data/raw/cc100_ko" -maxdepth 1 -type f 2>/dev/null | wc -l)))
  if [ $count -gt 0 ]; then
    raw_ready=true
  fi
fi

printf "  Training binaries    : %s\n" "$([ "$binary_ready" = true ] && echo "✓ ready" || echo "✗ not ready")"
printf "  Tokenizer            : %s\n" "$([ "$tokenizer_ready" = true ] && echo "✓ ready" || echo "✗ not ready")"
printf "  Raw data             : %s\n" "$([ "$raw_ready" = true ] && echo "✓ ready" || echo "✗ not ready")"

echo ""

# ============================================================================
# 5. Training configuration parameters
# ============================================================================
echo "[ Training config (1B model) ]"
echo "  Batch size            : 8"
echo "  Sequence length       : 4096"
echo "  Num GPUs              : 8"
echo "  Gradient accumulation : 4"
echo "  Tokens/step           : 8 × 4 × 4096 × 8 = 1,048,576"
echo ""

echo "=== Check complete ==="
source/scripts/clickhouse-watchdog.sh
ADDED
@@ -0,0 +1,201 @@
#!/usr/bin/env bash
#
# clickhouse-watchdog.sh — ClickHouse health check + automatic restart
# Register in crontab to run every minute
#
# Usage:
#   */1 * * * * /PROJECT/0325120031_A/ghong/taketimes/llm-bang/scripts/clickhouse-watchdog.sh
#

set -euo pipefail

# ── Configuration ─────────────────────────────────────
CH_BIN="/PROJECT/0325120031_A/ghong/taketimes/clickhouse-bin"
CH_CONFIG="/PROJECT/0325120031_A/ghong/taketimes/llm-bang/configs/clickhouse-config.xml"
TCP_PORT=9000
HTTP_PORT=8123
HOST="127.0.0.1"

LOG_DIR="/tmp/clickhouse"
LOG_FILE="${LOG_DIR}/watchdog.log"
MAX_LOG_SIZE=$((10 * 1024 * 1024))  # rotate at 10MB

RESTART_COOLDOWN=180                # seconds — no retry within this window after a restart
LAST_RESTART_FILE="/tmp/clickhouse-last-restart"
HEALTH_CHECK_TIMEOUT=5              # seconds — curl/query health-check timeout

# ── Functions ─────────────────────────────────────────
mkdir -p "$LOG_DIR"

log() {
  echo "[$(date '+%Y-%m-%d %H:%M:%S')] [clickhouse-watchdog] $*" >> "$LOG_FILE"
}

rotate_log() {
  local file="$1"
  if [[ -f "$file" ]] && [[ $(stat -c%s "$file" 2>/dev/null || echo 0) -gt $MAX_LOG_SIZE ]]; then
    mv "$file" "${file}.old"
    log "Log rotated: $file"
  fi
}

is_tcp_port_open() {
  if command -v ss &>/dev/null; then
    ss -tlnH "sport = :${TCP_PORT}" 2>/dev/null | grep -q "$TCP_PORT"
  else
    (echo > /dev/tcp/"$HOST"/"$TCP_PORT") 2>/dev/null
  fi
}

is_http_responding() {
  # HTTP interface ping — ClickHouse answers GET /ping with "Ok.\n"
  if command -v curl &>/dev/null; then
    local resp
    resp=$(curl -s --max-time "$HEALTH_CHECK_TIMEOUT" "http://${HOST}:${HTTP_PORT}/ping" 2>/dev/null || true)
    [[ "$resp" == "Ok." ]]
  else
    # Without curl, only check the TCP port
    (echo > /dev/tcp/"$HOST"/"$HTTP_PORT") 2>/dev/null
  fi
}

is_process_alive() {
  # ClickHouse's internal watchdog process is named "clickhouse-watchdog" (the binary itself).
  # Use a pattern that includes the --daemon flag to distinguish it from this script.
  pgrep -f "clickhouse.*server.*--daemon" >/dev/null 2>&1
}

can_execute_query() {
  # Run an actual query to confirm the server responds
  local result
  result=$("$CH_BIN" client --port "$TCP_PORT" --query "SELECT 1" 2>/dev/null || true)
  [[ "$result" == "1" ]]
}

cooldown_active() {
  if [[ -f "$LAST_RESTART_FILE" ]]; then
    local last_restart now diff
    last_restart=$(cat "$LAST_RESTART_FILE" 2>/dev/null)
    now=$(date +%s)
    diff=$(( now - last_restart ))
    if [[ $diff -lt $RESTART_COOLDOWN ]]; then
      return 0  # still cooling down
    fi
  fi
  return 1  # not in cooldown
}

stop_existing() {
  log "Stopping existing ClickHouse processes..."
  local my_pid=$$
  local pids

  # Try graceful shutdown (server process)
  pids=$(pgrep -f "clickhouse.*server.*--daemon" 2>/dev/null | grep -v "^${my_pid}$" || true)
  if [[ -n "$pids" ]]; then
    log "Sending TERM to PIDs: $pids"
    echo "$pids" | xargs kill -TERM 2>/dev/null || true
    sleep 3
    # Force kill anything still alive
    pids=$(pgrep -f "clickhouse.*server.*--daemon" 2>/dev/null | grep -v "^${my_pid}$" || true)
    if [[ -n "$pids" ]]; then
      log "Force killing PIDs: $pids"
      echo "$pids" | xargs kill -9 2>/dev/null || true
      sleep 2
    fi
  fi
}

start_server() {
  log "Starting ClickHouse server (daemon mode)..."

  # Clean up existing processes
  stop_existing

  # Create required directories
  mkdir -p /tmp/clickhouse/logs
  mkdir -p /tmp/clickhouse-tmp

  # Start in daemon mode
  "$CH_BIN" server --config-file="$CH_CONFIG" --daemon

  # Wait and verify after start (up to 15s)
  local attempts=0
  local max_attempts=15
  while [[ $attempts -lt $max_attempts ]]; do
    sleep 1
    attempts=$((attempts + 1))
    if is_tcp_port_open && can_execute_query; then
      date +%s > "$LAST_RESTART_FILE"
      log "ClickHouse started successfully (took ${attempts}s)"
      return 0
    fi
  done

  date +%s > "$LAST_RESTART_FILE"
  log "ERROR: ClickHouse did not respond within ${max_attempts}s after start"
  return 1
}

# ── Main logic ────────────────────────────────────────
rotate_log "$LOG_FILE"

# 1) Binary exists?
if [[ ! -x "$CH_BIN" ]]; then
  log "FATAL: ClickHouse binary not found or not executable: $CH_BIN"
  exit 1
fi

# 2) Process + port + query checks
process_ok=false
port_ok=false
query_ok=false

if is_process_alive; then
  process_ok=true
fi

if is_tcp_port_open; then
  port_ok=true
fi

if $port_ok && can_execute_query; then
  query_ok=true
fi

# 3) Verdict
if $process_ok && $port_ok && $query_ok; then
  # Fully healthy — nothing to do
  exit 0
fi

# Also check HTTP (for diagnostic logging)
http_ok=false
if is_http_responding; then
  http_ok=true
fi

# Log the unhealthy state
if $process_ok && $port_ok && ! $query_ok; then
  log "WARN: Process alive, port open, but query failed. Possible hung state."
elif $process_ok && ! $port_ok; then
  log "WARN: Process alive but TCP port $TCP_PORT not listening."
elif ! $process_ok; then
  log "WARN: ClickHouse is completely down (no process found)."
fi
log "Status: process=$process_ok port=$port_ok query=$query_ok http=$http_ok"

# 4) Cooldown check
if cooldown_active; then
  log "Cooldown active (last restart < ${RESTART_COOLDOWN}s ago). Skipping."
  exit 0
fi

# 5) Restart
log "Attempting ClickHouse restart..."
if start_server; then
  log "ClickHouse restart SUCCESS"
else
  log "ClickHouse restart FAILED"
  exit 1
fi
source/scripts/convert_3b_gguf.sh
ADDED
@@ -0,0 +1,229 @@
#!/usr/bin/env bash
# =============================================================================
# convert_3b_gguf.sh — Convert the 3B model from HuggingFace to GGUF + multi-quantization
#
# Usage:
#   bash scripts/convert_3b_gguf.sh [options]
#
# Options:
#   --input_dir DIR    HF-format model directory (default: outputs/hf_korean_3b_orpo)
#   --out_dir DIR      GGUF output directory (default: outputs/gguf)
#   --checkpoint DIR   Custom checkpoint directory (runs the HF conversion first if given)
#   --skip_hf_conv     Skip the HF conversion step (when HF format already exists)
#   --skip_quant       Skip quantization (produce only the F16 GGUF)
#
# Pipeline:
#   1. [optional] custom checkpoint → HF transformers format (convert_to_hf.py)
#   2. HF → F16 GGUF (llama.cpp/convert_hf_to_gguf.py)
#   3. F16 GGUF → Q4_K_M, Q5_K_M, Q8_0 quantizations (llama-quantize)
#
# Outputs:
#   outputs/gguf/frankenstallm-3b-f16.gguf
#   outputs/gguf/frankenstallm-3b-Q4_K_M.gguf — recommended (for Ollama)
#   outputs/gguf/frankenstallm-3b-Q5_K_M.gguf
#   outputs/gguf/frankenstallm-3b-Q8_0.gguf
#
# Prerequisites:
#   - HF conversion done via python scripts/convert_to_hf.py (or use --checkpoint)
#   - git, cmake, make installed
#   - pip install safetensors
# =============================================================================
set -euo pipefail

# ---------------------------------------------------------------------------
# Argument parsing
# ---------------------------------------------------------------------------
INPUT_DIR="outputs/hf_korean_3b_orpo"
OUT_DIR="outputs/gguf"
CHECKPOINT_DIR=""
SKIP_HF_CONV=false
SKIP_QUANT=false

while [[ $# -gt 0 ]]; do
  case "$1" in
    --input_dir) INPUT_DIR="$2"; shift 2 ;;
    --out_dir) OUT_DIR="$2"; shift 2 ;;
    --checkpoint) CHECKPOINT_DIR="$2"; shift 2 ;;
    --skip_hf_conv) SKIP_HF_CONV=true; shift ;;
    --skip_quant) SKIP_QUANT=true; shift ;;
    -h|--help)
      grep '^#' "$0" | head -40 | sed 's/^# \{0,1\}//'
      exit 0 ;;
    *)
      echo "ERROR: unknown option: $1"
      echo "Usage: bash scripts/convert_3b_gguf.sh [--input_dir DIR] [--out_dir DIR] [--checkpoint DIR] [--skip_hf_conv] [--skip_quant]"
      exit 1 ;;
  esac
done

PROJECT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
LLAMA_CPP_DIR="${LLAMA_CPP_DIR:-$PROJECT_DIR/outputs/llama.cpp}"
MODEL_NAME="frankenstallm-3b"

cd "$PROJECT_DIR"

echo "=================================================================="
echo "  3B model GGUF conversion pipeline"
echo "  Input HF directory   : $INPUT_DIR"
echo "  GGUF output directory: $OUT_DIR"
echo "  llama.cpp path       : $LLAMA_CPP_DIR"
echo "=================================================================="
echo ""

# ---------------------------------------------------------------------------
# Step 0: check for / clone llama.cpp
# ---------------------------------------------------------------------------
if [[ ! -d "$LLAMA_CPP_DIR" ]]; then
  echo "[SETUP] llama.cpp directory not found."
  echo "        Install it with:"
  echo ""
  echo "        git clone --depth 1 https://github.com/ggerganov/llama.cpp $LLAMA_CPP_DIR"
  echo ""
  echo "        Or point LLAMA_CPP_DIR at an existing checkout:"
  echo "        LLAMA_CPP_DIR=/path/to/llama.cpp bash scripts/convert_3b_gguf.sh"
  echo ""
  read -r -p "Clone it automatically now? [y/N] " _yn
  if [[ "${_yn:-N}" =~ ^[Yy]$ ]]; then
    echo "Cloning llama.cpp ..."
    git clone --depth 1 https://github.com/ggerganov/llama.cpp "$LLAMA_CPP_DIR"
  else
    echo "Aborting. Install llama.cpp and run again."
    exit 1
  fi
fi

# llama.cpp Python dependencies
echo "[SETUP] Installing llama.cpp Python dependencies ..."
pip install -r "$LLAMA_CPP_DIR/requirements.txt" --break-system-packages -q

# ---------------------------------------------------------------------------
# Step 1: custom checkpoint → HF format conversion (optional)
# ---------------------------------------------------------------------------
if [[ -n "$CHECKPOINT_DIR" && "$SKIP_HF_CONV" == "false" ]]; then
  echo ""
  echo "[STEP 1] Custom checkpoint → HF format conversion"
  echo "  Checkpoint: $CHECKPOINT_DIR"
  echo "  Output    : $INPUT_DIR"
  echo ""

  if [[ ! -d "$CHECKPOINT_DIR" ]]; then
    echo "ERROR: checkpoint directory not found: $CHECKPOINT_DIR"
    exit 1
  fi

  python "$PROJECT_DIR/scripts/convert_to_hf.py" \
    --checkpoint "$CHECKPOINT_DIR" \
    --output "$INPUT_DIR" \
    --tokenizer "tokenizer/korean_sp/tokenizer.json"

  echo "  [OK] HF conversion complete → $INPUT_DIR"
elif [[ "$SKIP_HF_CONV" == "true" ]]; then
  echo "[STEP 1] HF conversion skipped (--skip_hf_conv)"
else
  echo "[STEP 1] No checkpoint given — using the HF directory directly."
fi

# Final validation of the HF directory
if [[ ! -d "$INPUT_DIR" ]]; then
  echo "ERROR: HF model directory not found: $INPUT_DIR"
  echo "  Specify a checkpoint with --checkpoint, or"
  echo "  run python scripts/convert_to_hf.py first."
  exit 1
fi

if [[ ! -f "$INPUT_DIR/config.json" ]]; then
  echo "ERROR: config.json missing: $INPUT_DIR/config.json"
  exit 1
fi

mkdir -p "$OUT_DIR"

# ---------------------------------------------------------------------------
# Step 2: build llama.cpp (llama-quantize binary)
# ---------------------------------------------------------------------------
QUANTIZE_BIN="$LLAMA_CPP_DIR/build/bin/llama-quantize"

if [[ ! -f "$QUANTIZE_BIN" ]]; then
  echo ""
  echo "[STEP 2] Building llama.cpp (llama-quantize) ..."
  cmake -S "$LLAMA_CPP_DIR" -B "$LLAMA_CPP_DIR/build" \
    -DCMAKE_BUILD_TYPE=Release \
    -DGGML_CUDA=ON \
    2>&1 | tail -10
  cmake --build "$LLAMA_CPP_DIR/build" --target llama-quantize -j "$(nproc)" \
    2>&1 | tail -10
  echo "  [OK] Build complete: $QUANTIZE_BIN"
else
  echo "[STEP 2] llama-quantize binary already present — skipping build"
fi

# ---------------------------------------------------------------------------
# Step 3: HF → F16 GGUF conversion
# ---------------------------------------------------------------------------
F16_GGUF="$OUT_DIR/${MODEL_NAME}-f16.gguf"

echo ""
echo "[STEP 3] HF → F16 GGUF conversion"
echo "  Input : $INPUT_DIR"
echo "  Output: $F16_GGUF"
echo ""

python "$LLAMA_CPP_DIR/convert_hf_to_gguf.py" "$INPUT_DIR" \
  --outfile "$F16_GGUF" \
  --outtype f16

echo "  [OK] F16 GGUF size: $(du -sh "$F16_GGUF" | cut -f1) ($F16_GGUF)"

# ---------------------------------------------------------------------------
# Step 4: multi-quantization (Q4_K_M, Q5_K_M, Q8_0)
# ---------------------------------------------------------------------------
if [[ "$SKIP_QUANT" == "true" ]]; then
  echo ""
  echo "[STEP 4] Quantization skipped (--skip_quant)"
else
  echo ""
  echo "[STEP 4] Starting multi-quantization ..."

  if [[ ! -f "$QUANTIZE_BIN" ]]; then
    echo "[WARN] llama-quantize binary not found: $QUANTIZE_BIN"
    echo "       Skipping quantization. Only the F16 GGUF was produced."
    echo "       Manual build: cmake --build $LLAMA_CPP_DIR/build --target llama-quantize"
  else
    # Q4_K_M — smallest size, quality/speed balance (default recommendation for Ollama)
    Q4KM_GGUF="$OUT_DIR/${MODEL_NAME}-Q4_K_M.gguf"
    echo "  → Q4_K_M quantization: $Q4KM_GGUF ..."
    "$QUANTIZE_BIN" "$F16_GGUF" "$Q4KM_GGUF" Q4_K_M
    echo "    Size: $(du -sh "$Q4KM_GGUF" | cut -f1)"

    # Q5_K_M — middle size, higher quality
    Q5KM_GGUF="$OUT_DIR/${MODEL_NAME}-Q5_K_M.gguf"
    echo "  → Q5_K_M quantization: $Q5KM_GGUF ..."
    "$QUANTIZE_BIN" "$F16_GGUF" "$Q5KM_GGUF" Q5_K_M
    echo "    Size: $(du -sh "$Q5KM_GGUF" | cut -f1)"

    # Q8_0 — highest quality (close to F16)
    Q8_GGUF="$OUT_DIR/${MODEL_NAME}-Q8_0.gguf"
    echo "  → Q8_0 quantization: $Q8_GGUF ..."
    "$QUANTIZE_BIN" "$F16_GGUF" "$Q8_GGUF" Q8_0
    echo "    Size: $(du -sh "$Q8_GGUF" | cut -f1)"

    echo ""
    echo "  [OK] All quantizations complete"
  fi
fi

# ---------------------------------------------------------------------------
# Completion summary
# ---------------------------------------------------------------------------
echo ""
echo "=================================================================="
echo "  3B GGUF conversion complete"
echo ""
echo "  Output files:"
ls -lh "$OUT_DIR/${MODEL_NAME}"*.gguf 2>/dev/null | awk '{print "    " $5 "  " $9}' || \
  echo "    (list files with: ls -lh $OUT_DIR/)"
echo ""
echo "  Next steps:"
echo "    bash scripts/deploy_3b_ollama.sh"
echo "    bash scripts/quality_gate.sh deploy"
echo "=================================================================="
source/scripts/convert_to_gguf.sh
ADDED
@@ -0,0 +1,92 @@
#!/usr/bin/env bash
# =============================================================================
# convert_to_gguf.sh — Convert a HuggingFace-format model to GGUF + Q4_K_M quantization
#
# Usage:
#   bash scripts/convert_to_gguf.sh [hf_dir] [out_dir]
#
#   hf_dir  : HF-format model directory (default: outputs/hf)
#   out_dir : GGUF output directory (default: outputs/gguf)
#
# Outputs:
#   outputs/gguf/korean-1b-f16.gguf  — F16 GGUF
#   outputs/gguf/korean-1b-q4km.gguf — Q4_K_M quantization (for Ollama)
#
# Prerequisites:
#   - HF conversion done via python scripts/convert_to_hf.py
#   - git, cmake, make installed
#   - pip install safetensors (falls back to pytorch_model.bin if missing)
# =============================================================================
set -euo pipefail

HF_DIR="${1:-outputs/hf}"
OUT_DIR="${2:-outputs/gguf}"
LLAMA_CPP_DIR="outputs/llama.cpp"
PROJECT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"

cd "$PROJECT_DIR"

# --- Pre-flight check -------------------------------------------------------
if [[ ! -d "$HF_DIR" ]]; then
  echo "ERROR: HF model directory not found: $HF_DIR"
  echo "Run first: python scripts/convert_to_hf.py --checkpoint <ckpt> --output $HF_DIR"
  exit 1
fi

if [[ ! -f "$HF_DIR/config.json" ]]; then
  echo "ERROR: config.json not found in $HF_DIR"
  exit 1
fi

mkdir -p "$OUT_DIR"

# --- Clone llama.cpp if not present -----------------------------------------
if [[ ! -d "$LLAMA_CPP_DIR" ]]; then
  echo "Cloning llama.cpp ..."
  git clone --depth 1 https://github.com/ggerganov/llama.cpp "$LLAMA_CPP_DIR"
fi

# Install Python requirements for conversion script
echo "Installing llama.cpp Python deps ..."
pip install -r "$LLAMA_CPP_DIR/requirements.txt" --break-system-packages -q

# --- Build llama.cpp (for quantization binary) ------------------------------
QUANTIZE_BIN="$LLAMA_CPP_DIR/build/bin/llama-quantize"
if [[ ! -f "$QUANTIZE_BIN" ]]; then
  echo "Building llama.cpp (quantization tool) ..."
  cmake -S "$LLAMA_CPP_DIR" -B "$LLAMA_CPP_DIR/build" \
    -DCMAKE_BUILD_TYPE=Release \
    -DGGML_CUDA=ON \
    2>&1 | tail -5
  cmake --build "$LLAMA_CPP_DIR/build" --target llama-quantize -j "$(nproc)" \
    2>&1 | tail -5
fi

# --- F16 GGUF conversion ---------------------------------------------------
F16_GGUF="$OUT_DIR/korean-1b-f16.gguf"
echo "Converting to F16 GGUF: $F16_GGUF ..."
python "$LLAMA_CPP_DIR/convert_hf_to_gguf.py" "$HF_DIR" \
  --outfile "$F16_GGUF" \
  --outtype f16

echo "F16 GGUF size: $(du -sh "$F16_GGUF" | cut -f1)"

# --- Q4_K_M quantization ---------------------------------------------------
Q4KM_GGUF="$OUT_DIR/korean-1b-q4km.gguf"
if [[ -f "$QUANTIZE_BIN" ]]; then
  echo "Quantizing to Q4_K_M: $Q4KM_GGUF ..."
  "$QUANTIZE_BIN" "$F16_GGUF" "$Q4KM_GGUF" Q4_K_M
  echo "Q4_K_M GGUF size: $(du -sh "$Q4KM_GGUF" | cut -f1)"
else
  echo "[WARN] llama-quantize binary not found. Using F16 GGUF for Ollama."
  echo "       Build: cmake --build $LLAMA_CPP_DIR/build --target llama-quantize"
  cp "$F16_GGUF" "$Q4KM_GGUF"
fi

echo ""
echo "=================================================================="
echo "  GGUF conversion complete"
echo "  F16 : $F16_GGUF"
echo "  Q4KM: $Q4KM_GGUF"
echo "  Next step: bash scripts/deploy_ollama.sh"
echo "=================================================================="
source/scripts/convert_to_hf.py
ADDED
@@ -0,0 +1,262 @@
"""
Convert custom LLM checkpoint to HuggingFace LlamaForCausalLM format.

Usage:
    python scripts/convert_to_hf.py \\
        --checkpoint checkpoints/korean_1b_fp8_run1/checkpoint-0034000 \\
        --output outputs/hf \\
        [--tokenizer tokenizer/korean_sp/tokenizer.json]

Outputs (in --output directory):
    config.json            — LlamaConfig
    model.safetensors      — converted weights
    tokenizer.json         — tokenizer (copied)
    tokenizer_config.json
    generation_config.json
"""

from __future__ import annotations

import argparse
import json
import shutil
import sys
from pathlib import Path

import torch

_PROJECT_ROOT = Path(__file__).resolve().parent.parent
if str(_PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(_PROJECT_ROOT))

from model.config import LMConfig


def remap_weights(
    src_state_dict: dict,
    config: LMConfig,
) -> dict:
    """
    Remap custom LLM weight names to HuggingFace LlamaForCausalLM names.

    Handles both FP8 (te.LayerNormMLP / te.Linear) and BF16 (SwiGLU / nn.Linear)
    checkpoints transparently.
    """
    dst = {}
    is_fp8 = config.use_fp8

    # --- Token embedding ---
    dst["model.embed_tokens.weight"] = src_state_dict["embedding.weight"].float()

    for i in range(config.n_layers):
        pfx = f"layers.{i}"
        hpfx = f"model.layers.{i}"

        # Attention norm (always RMSNorm)
        dst[f"{hpfx}.input_layernorm.weight"] = (
            src_state_dict[f"{pfx}.attn_norm.weight"].float()
        )

        # Attention projections
        # Handle fused QKV (te.Linear with qkv_proj) vs separate q/k/v
        qkv_key = f"{pfx}.attn.qkv_proj.weight"
        if qkv_key in src_state_dict:
            # Fused QKV: [Q_dim + K_dim + V_dim, d_model]
            # GQA: Q = n_heads * head_dim, K = V = n_kv_heads * head_dim
            qkv = src_state_dict[qkv_key].float()
            head_dim = config.d_model // config.n_heads
            q_dim = config.n_heads * head_dim     # e.g. 24 * 128 = 3072
            k_dim = config.n_kv_heads * head_dim  # e.g. 8 * 128 = 1024
            v_dim = config.n_kv_heads * head_dim  # e.g. 8 * 128 = 1024
            assert qkv.shape[0] == q_dim + k_dim + v_dim, (
                f"QKV shape mismatch: {qkv.shape[0]} != {q_dim}+{k_dim}+{v_dim}"
            )
            dst[f"{hpfx}.self_attn.q_proj.weight"] = qkv[:q_dim]
            dst[f"{hpfx}.self_attn.k_proj.weight"] = qkv[q_dim:q_dim + k_dim]
            dst[f"{hpfx}.self_attn.v_proj.weight"] = qkv[q_dim + k_dim:]
        else:
            # Separate q/k/v projections
            for src_name, dst_name in [
                ("q_proj", "self_attn.q_proj"),
                ("k_proj", "self_attn.k_proj"),
                ("v_proj", "self_attn.v_proj"),
            ]:
                w_key = f"{pfx}.attn.{src_name}.weight"
                if w_key in src_state_dict:
                    dst[f"{hpfx}.{dst_name}.weight"] = src_state_dict[w_key].float()

        # Output projection
        out_key = f"{pfx}.attn.out_proj.weight"
        if out_key in src_state_dict:
            dst[f"{hpfx}.self_attn.o_proj.weight"] = src_state_dict[out_key].float()

        # FFN — FP8 (te.LayerNormMLP) vs BF16 (SwiGLU)
        if is_fp8 and f"{pfx}.ffn.layer_norm_weight" in src_state_dict:
            # te.LayerNormMLP: RMSNorm is fused inside
            dst[f"{hpfx}.post_attention_layernorm.weight"] = (
                src_state_dict[f"{pfx}.ffn.layer_norm_weight"].float()
            )
            # fc1_weight: [2*d_ffn, d_model] — gate and up are concatenated
            fc1 = src_state_dict[f"{pfx}.ffn.fc1_weight"].float()
            half = fc1.shape[0] // 2
            dst[f"{hpfx}.mlp.gate_proj.weight"] = fc1[:half]
            dst[f"{hpfx}.mlp.up_proj.weight"] = fc1[half:]
            # fc2_weight: [d_model, d_ffn]
            dst[f"{hpfx}.mlp.down_proj.weight"] = (
                src_state_dict[f"{pfx}.ffn.fc2_weight"].float()
            )
        else:
            # Standard SwiGLU (BF16 checkpoint)
            dst[f"{hpfx}.post_attention_layernorm.weight"] = (
                src_state_dict[f"{pfx}.ffn_norm.weight"].float()
            )
            dst[f"{hpfx}.mlp.gate_proj.weight"] = (
                src_state_dict[f"{pfx}.ffn.gate_proj.weight"].float()
            )
            dst[f"{hpfx}.mlp.up_proj.weight"] = (
                src_state_dict[f"{pfx}.ffn.up_proj.weight"].float()
            )
            dst[f"{hpfx}.mlp.down_proj.weight"] = (
                src_state_dict[f"{pfx}.ffn.down_proj.weight"].float()
            )

    # --- Final norm and LM head ---
    dst["model.norm.weight"] = src_state_dict["norm.weight"].float()
    # Weight tying: embedding.weight == lm_head.weight in our model.
    # HF LlamaForCausalLM expects lm_head.weight explicitly.
    dst["lm_head.weight"] = src_state_dict["embedding.weight"].float().clone()

    return dst


def build_llama_config(config: LMConfig) -> dict:
    """Map LMConfig fields to HuggingFace LlamaConfig dict."""
    return {
        "architectures": ["LlamaForCausalLM"],
        "model_type": "llama",
        "hidden_size": config.d_model,
        "intermediate_size": config.d_ffn,
        "num_hidden_layers": config.n_layers,
        "num_attention_heads": config.n_heads,
        "num_key_value_heads": config.n_kv_heads,
        "hidden_act": "silu",
        "max_position_embeddings": config.max_seq_len,
        "initializer_range": 0.02,
        "rms_norm_eps": 1e-5,
        "vocab_size": config.vocab_size,
        "rope_theta": config.rope_theta,
        "rope_scaling": None,
        "attention_bias": config.bias,
        "tie_word_embeddings": True,
        "torch_dtype": "float16",
        "transformers_version": "4.40.0",
    }


def main() -> None:
    parser = argparse.ArgumentParser(
        description="Convert custom LLM checkpoint to HuggingFace LlamaForCausalLM format."
    )
    parser.add_argument(
        "--checkpoint",
        required=True,
        type=Path,
        help="Path to checkpoint directory (must contain model.pt + config.yaml).",
    )
    parser.add_argument(
        "--output",
        required=True,
        type=Path,
        help="Output directory for HF-format files.",
    )
    parser.add_argument(
        "--tokenizer",
        type=Path,
        default=Path("tokenizer/korean_sp/tokenizer.json"),
        help="Path to tokenizer.json (default: tokenizer/korean_sp/tokenizer.json).",
    )
    args = parser.parse_args()

    ckpt_path = args.checkpoint
    out_path = args.output

    if not ckpt_path.exists():
        raise FileNotFoundError(f"Checkpoint not found: {ckpt_path}")

    out_path.mkdir(parents=True, exist_ok=True)
    print(f"Checkpoint : {ckpt_path}")
    print(f"Output     : {out_path}")

    # Load config
    config = LMConfig.from_yaml(ckpt_path / "config.yaml")
    print(f"Model      : d_model={config.d_model}, n_layers={config.n_layers}, "
          f"vocab_size={config.vocab_size}, use_fp8={config.use_fp8}")

    # Load weights
    print("Loading model.pt ...")
    state_dict = torch.load(
        ckpt_path / "model.pt",
        map_location="cpu",
        weights_only=True,
    )
    print(f"  Source keys: {len(state_dict)}")

    # Remap
    print("Remapping weight names ...")
    hf_state_dict = remap_weights(state_dict, config)
    print(f"  Destination keys: {len(hf_state_dict)}")

    # Save safetensors
    print("Saving model.safetensors ...")
    try:
        from safetensors.torch import save_file
        save_file(hf_state_dict, out_path / "model.safetensors")
    except ImportError:
        print("  [WARN] safetensors not installed; falling back to pytorch_model.bin")
        torch.save(hf_state_dict, out_path / "pytorch_model.bin")

    # Save config.json
    llama_cfg = build_llama_config(config)
    with open(out_path / "config.json", "w", encoding="utf-8") as f:
        json.dump(llama_cfg, f, indent=2, ensure_ascii=False)
    print("Saved config.json")

    # Save generation_config.json
    gen_cfg = {
        "bos_token_id": 1,
        "eos_token_id": 2,
        "pad_token_id": 0,
        "max_new_tokens": 512,
        "temperature": 0.8,
        "top_p": 0.9,
        "do_sample": True,
    }
    with open(out_path / "generation_config.json", "w", encoding="utf-8") as f:
        json.dump(gen_cfg, f, indent=2, ensure_ascii=False)

    # Copy tokenizer
    tok_src = args.tokenizer
    if tok_src.exists():
        shutil.copy(tok_src, out_path / "tokenizer.json")
        # Minimal tokenizer_config.json for HF compatibility
        tok_cfg = {
            "model_type": "llama",
            "tokenizer_class": "PreTrainedTokenizerFast",
            "bos_token": "<s>",
            "eos_token": "</s>",
            "unk_token": "<unk>",
            "pad_token": "<pad>",
            "clean_up_tokenization_spaces": False,
        }
        with open(out_path / "tokenizer_config.json", "w", encoding="utf-8") as f:
            json.dump(tok_cfg, f, indent=2, ensure_ascii=False)
|
| 253 |
+
print(f"Copied tokenizer: {tok_src} -> {out_path / 'tokenizer.json'}")
|
| 254 |
+
else:
|
| 255 |
+
print(f"[WARN] Tokenizer not found at {tok_src}. Copy manually.")
|
| 256 |
+
|
| 257 |
+
print(f"\nDone! HF model saved to: {out_path}")
|
| 258 |
+
print("Verify: ls -lh", out_path)
|
| 259 |
+
|
| 260 |
+
|
| 261 |
+
if __name__ == "__main__":
|
| 262 |
+
main()
|
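Once conversion finishes, a quick load test catches remapping mistakes (missing keys, vocab mismatches) before moving on to GGUF conversion. A minimal sketch, assuming `transformers` is installed; the output path and prompt below are placeholders, not paths the repo guarantees:

```bash
# Hypothetical smoke test — replace the path with whatever was passed to --output.
python3 - <<'EOF'
from transformers import AutoModelForCausalLM, AutoTokenizer

path = "outputs/hf_3b"  # placeholder
tok = AutoTokenizer.from_pretrained(path)
model = AutoModelForCausalLM.from_pretrained(path)

ids = tok("안녕하세요", return_tensors="pt")
out = model.generate(**ids, max_new_tokens=16, do_sample=False)
print(tok.decode(out[0]))
EOF
```

A 3B model is slow on CPU, but even a few greedy tokens are enough to confirm the weights and tokenizer line up.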
source/scripts/deploy_3b_ollama.sh
ADDED
@@ -0,0 +1,146 @@

#!/usr/bin/env bash
# =============================================================================
# deploy_3b_ollama.sh — Register the 3B GGUF model with Ollama & run auto-tests
#
# Usage:
#   bash scripts/deploy_3b_ollama.sh [model_name]
#
#   model_name: Ollama model name (default: frankenstallm-3b)
#
# Prerequisites:
#   - ollama installed: https://ollama.com/download
#   - bash scripts/convert_3b_gguf.sh has been run
#   - outputs/gguf/frankenstallm-3b-Q4_K_M.gguf exists
#   - Modelfile.3b exists
# =============================================================================
set -euo pipefail

MODEL_NAME="${1:-frankenstallm-3b}"
PROJECT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
MODELFILE="$PROJECT_DIR/Modelfile.3b"
GGUF_PATH="$PROJECT_DIR/outputs/gguf/frankenstallm-3b-Q4_K_M.gguf"

cd "$PROJECT_DIR"

# ---------------------------------------------------------------------------
# Pre-flight check
# ---------------------------------------------------------------------------
if ! command -v ollama &> /dev/null; then
    echo "ERROR: ollama is not installed."
    echo "Install: curl -fsSL https://ollama.com/install.sh | sh"
    exit 1
fi

if [[ ! -f "$GGUF_PATH" ]]; then
    echo "ERROR: GGUF file not found: $GGUF_PATH"
    echo "Run first: bash scripts/convert_3b_gguf.sh"
    exit 1
fi

if [[ ! -f "$MODELFILE" ]]; then
    echo "ERROR: Modelfile.3b not found: $MODELFILE"
    echo "       Modelfile.3b must exist in the project root."
    exit 1
fi

echo "=================================================================="
echo " 3B Model Ollama Deployment"
echo " Model name: $MODEL_NAME"
echo " GGUF      : $(du -sh "$GGUF_PATH" | cut -f1) ($GGUF_PATH)"
echo " Modelfile : $MODELFILE"
echo "=================================================================="
echo ""

# ---------------------------------------------------------------------------
# Make sure the Ollama server is running
# ---------------------------------------------------------------------------
if ! ollama list &>/dev/null; then
    echo "[WARN] Ollama server is not responding. Starting it in the background ..."
    ollama serve &>/tmp/ollama_serve.log &
    OLLAMA_PID=$!
    echo "       PID: $OLLAMA_PID (log: /tmp/ollama_serve.log)"
    # Wait for the server to come up (max 15 seconds)
    for i in $(seq 1 15); do
        if ollama list &>/dev/null; then
            echo "       [OK] Ollama server ready (${i}s)"
            break
        fi
        sleep 1
    done
fi

# ---------------------------------------------------------------------------
# Register the model with Ollama
# ---------------------------------------------------------------------------
echo "[1/2] Registering Ollama model: $MODEL_NAME ..."
ollama create "$MODEL_NAME" -f "$MODELFILE"
echo "      [OK] Registered"

# ---------------------------------------------------------------------------
# Run 5 automated test prompts
# ---------------------------------------------------------------------------
echo ""
echo "[2/2] Running automated test prompts (5) ..."
echo ""

declare -a TEST_PROMPTS=(
    "안녕하세요! 간단히 자기소개를 해주세요."
    "대한민국의 수도는 어디인가요? 그 도시의 특징을 설명해주세요."
    "파이썬으로 피보나치 수열을 출력하는 함수를 작성해주세요."
    "인공지능이 사회에 미치는 긍정적인 영향 3가지를 설명해주세요."
    "오늘 저녁 메뉴로 무엇을 추천해주시겠어요? 이유도 함께 말씀해주세요."
)

PASS_COUNT=0
FAIL_COUNT=0
TOTAL=${#TEST_PROMPTS[@]}

for i in "${!TEST_PROMPTS[@]}"; do
    PROMPT="${TEST_PROMPTS[$i]}"
    NUM=$((i + 1))
    echo "--- Test $NUM/$TOTAL ---"
    echo "Prompt: $PROMPT"
    echo ""

    # ollama run: 60s timeout; show only the first 300 chars of the response
    if RESPONSE=$(timeout 60 ollama run "$MODEL_NAME" "$PROMPT" 2>&1); then
        RESP_PREVIEW="${RESPONSE:0:300}"
        echo "Response: $RESP_PREVIEW"
        if [[ ${#RESPONSE} -gt 300 ]]; then
            echo "  ... (${#RESPONSE} chars total)"
        fi
        echo "[OK] Test $NUM passed"
        PASS_COUNT=$((PASS_COUNT + 1))
    else
        EXIT_CODE=$?
        echo "[FAIL] Test $NUM failed (exit code: $EXIT_CODE)"
        FAIL_COUNT=$((FAIL_COUNT + 1))
    fi
    echo ""
done

# ---------------------------------------------------------------------------
# Summary
# ---------------------------------------------------------------------------
echo "=================================================================="
echo " Deployment & tests complete"
echo ""
echo " Model name: $MODEL_NAME"
echo " Tests     : $PASS_COUNT/$TOTAL passed ($FAIL_COUNT failed)"
echo ""
if [[ $FAIL_COUNT -eq 0 ]]; then
    echo " [PASS] All tests passed"
else
    echo " [WARN] Some tests failed — check the logs"
fi
echo ""
echo " Ollama usage:"
echo "   ollama run $MODEL_NAME"
echo "   ollama run $MODEL_NAME 'type your question here'"
echo "   ollama rm $MODEL_NAME   (remove)"
echo ""
echo " Quality Gate:"
echo "   bash scripts/quality_gate.sh deploy"
echo "=================================================================="

[[ $FAIL_COUNT -gt 0 ]] && exit 1 || exit 0
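The script requires a `Modelfile.3b` in the project root but the upload above does not show its contents. For orientation, a sketch of what such a Modelfile could look like, written to an `.example` file so the real one is not clobbered; every directive value here is an assumption, not the project's actual configuration:

```bash
# Hypothetical Modelfile sketch — GGUF path and sampling values are assumptions.
cat > Modelfile.3b.example <<'EOF'
FROM ./outputs/gguf/frankenstallm-3b-Q4_K_M.gguf
PARAMETER temperature 0.8
PARAMETER top_p 0.9
PARAMETER repeat_penalty 1.1
PARAMETER stop "</s>"
EOF
```

`FROM`, `PARAMETER`, `TEMPLATE`, and `SYSTEM` are the standard Ollama Modelfile directives; the real `Modelfile.3b` presumably also carries a chat `TEMPLATE` matching the SFT format.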
source/scripts/deploy_ollama.sh
ADDED
@@ -0,0 +1,118 @@

#!/usr/bin/env bash
# =============================================================================
# deploy_ollama.sh — FRANKENSTALLM 3B GGUF → Ollama one-click deployment
#
# Usage:
#   bash scripts/deploy_ollama.sh                  # default (Q4_K_M)
#   bash scripts/deploy_ollama.sh --quant Q8_0     # Q8_0 quantization
#   bash scripts/deploy_ollama.sh --skip_convert   # GGUF already exists
#
# Pipeline:
#   1. [optional] GGUF conversion + quantization (convert_3b_gguf.sh)
#   2. Check Ollama install / start the server
#   3. Register the model via Modelfile.3b
#   4. Automated tests (5 prompts)
#   5. Repetition-rate validation (15 prompts)
# =============================================================================
set -euo pipefail

QUANT="${QUANT:-Q4_K_M}"
MODEL_NAME="frankenstallm-3b"
SKIP_CONVERT=false

while [[ $# -gt 0 ]]; do
    case "$1" in
        --quant) QUANT="$2"; shift 2 ;;
        --skip_convert) SKIP_CONVERT=true; shift ;;
        -h|--help)
            grep '^#' "$0" | head -20 | sed 's/^# \{0,1\}//'
            exit 0 ;;
        *) echo "ERROR: unknown option: $1"; exit 1 ;;
    esac
done

PROJECT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
cd "$PROJECT_DIR"

GGUF_PATH="outputs/gguf/frankenstallm-3b-${QUANT}.gguf"
MODELFILE="Modelfile.3b"

echo "=================================================================="
echo " FRANKENSTALLM 3B Ollama Deployment"
echo " Quantization: $QUANT"
echo " GGUF        : $GGUF_PATH"
echo " Modelfile   : $MODELFILE"
echo "=================================================================="

# ---- Step 1: GGUF conversion (if needed) ----
if [[ "$SKIP_CONVERT" == "false" ]]; then
    if [[ ! -f "$GGUF_PATH" ]]; then
        echo ""
        echo "[Step 1] Running GGUF conversion ..."
        bash scripts/convert_3b_gguf.sh \
            --input_dir checkpoints/korean_3b_orpo_v1/checkpoint-9840
    else
        echo "[Step 1] GGUF file already exists — skipping conversion"
    fi
else
    echo "[Step 1] Conversion skipped (--skip_convert)"
fi

if [[ ! -f "$GGUF_PATH" ]]; then
    echo "ERROR: GGUF file missing: $GGUF_PATH"
    exit 1
fi

echo " GGUF size: $(du -sh "$GGUF_PATH" | cut -f1)"

# ---- Step 2: Check Ollama install ----
if ! command -v ollama &>/dev/null; then
    echo ""
    echo "[Step 2] Ollama not installed — installing ..."
    curl -fsSL https://ollama.com/install.sh | sh
fi

# Start the Ollama server
if ! ollama list &>/dev/null; then
    echo "[Step 2] Starting Ollama server ..."
    ollama serve &>/tmp/ollama_serve.log &
    for i in $(seq 1 15); do
        if ollama list &>/dev/null; then
            echo " [OK] Ollama server ready (${i}s)"
            break
        fi
        sleep 1
    done
fi

# ---- Step 3: Register the model ----
echo ""
echo "[Step 3] Registering Ollama model: $MODEL_NAME"
ollama create "$MODEL_NAME" -f "$MODELFILE"
echo " [OK] Registered"

# ---- Step 4: Automated tests ----
echo ""
echo "[Step 4] Automated tests ..."
declare -a QUICK_TESTS=(
    "대한민국의 수도는?"
    "인공지능이란 무엇인가요?"
    "한국의 전통 음식 중에서 김치에 대해 설명해주세요."
)

for prompt in "${QUICK_TESTS[@]}"; do
    echo " Q: $prompt"
    RESP=$(timeout 60 ollama run "$MODEL_NAME" "$prompt" 2>&1 || echo "[TIMEOUT/ERROR]")
    echo " A: ${RESP:0:200}"
    echo ""
done

# ---- Step 5: Repetition-rate validation ----
echo "[Step 5] Repetition-rate validation (15 prompts) ..."
python3 scripts/test_ollama_repetition.py --model "$MODEL_NAME"

echo ""
echo "=================================================================="
echo " Deployment complete!"
echo " Usage: ollama run $MODEL_NAME"
echo "=================================================================="
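For scripted checks like Steps 4 and 5, the local Ollama HTTP API is often easier to drive than the `ollama run` CLI, since the response comes back as JSON. A sketch against the default port 11434 (assumes `jq` is installed; not something these scripts currently do):

```bash
# Query the Ollama HTTP API directly; "stream": false returns one JSON object.
curl -s http://localhost:11434/api/generate \
  -d '{"model": "frankenstallm-3b", "prompt": "대한민국의 수도는?", "stream": false}' \
  | jq -r '.response'
```

This also sidesteps the interactive-terminal behavior of `ollama run` when capturing output under `timeout`.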
source/scripts/fix_tokenizer_byte_fallback.py
ADDED
@@ -0,0 +1,235 @@

#!/usr/bin/env python3
"""Fix GGUF newline crash by adding byte-fallback tokens to the tokenizer.

Problem: The SentencePiece Unigram tokenizer was trained without byte_fallback=True,
so characters like \n have no token representation. llama.cpp crashes when it
encounters these characters because there's no byte-fallback.

Fix:
1. Add 256 byte-fallback tokens (<0x00> .. <0xFF>) to tokenizer.json
2. Resize model embeddings from 64000 -> 64256
3. Update config.json vocab_size
4. Copy tokenizer.model for proper GGUF conversion

Usage:
    python scripts/fix_tokenizer_byte_fallback.py \
        --input outputs/hf_checkpoint-best \
        --output outputs/hf_checkpoint-best-fixed \
        --sp_model tokenizer/korean_sp/tokenizer.model
"""

import argparse
import json
import shutil
from pathlib import Path

import torch
from safetensors.torch import load_file, save_file


BYTE_FALLBACK_COUNT = 256
BYTE_TOKEN_TEMPLATE = "<0x{:02X}>"


def fix_tokenizer_json(input_path: Path, output_path: Path):
    """Add byte_fallback=True and 256 byte tokens to tokenizer.json."""
    with open(input_path) as f:
        tok = json.load(f)

    model = tok["model"]
    vocab = model["vocab"]  # list of [piece, score]
    original_size = len(vocab)

    # Enable byte_fallback
    model["byte_fallback"] = True

    # Add 256 byte tokens with very low score (they're fallback only)
    for i in range(BYTE_FALLBACK_COUNT):
        byte_token = BYTE_TOKEN_TEMPLATE.format(i)
        vocab.append([byte_token, 0.0])

    new_size = len(vocab)
    print(f"  Vocab: {original_size} -> {new_size} (+{BYTE_FALLBACK_COUNT} byte tokens)")
    print("  byte_fallback: False -> True")

    # Also add byte tokens to the added_tokens list
    added = tok.get("added_tokens", [])
    for i in range(BYTE_FALLBACK_COUNT):
        byte_token = BYTE_TOKEN_TEMPLATE.format(i)
        added.append({
            "id": original_size + i,
            "content": byte_token,
            "single_word": False,
            "lstrip": False,
            "rstrip": False,
            "normalized": False,
            "special": True,
        })
    tok["added_tokens"] = added

    with open(output_path, "w") as f:
        json.dump(tok, f, ensure_ascii=False, indent=2)

    return original_size, new_size


def fix_config_json(input_path: Path, output_path: Path, new_vocab_size: int):
    """Update vocab_size in config.json."""
    with open(input_path) as f:
        config = json.load(f)

    old_size = config["vocab_size"]
    config["vocab_size"] = new_vocab_size
    print(f"  config.json vocab_size: {old_size} -> {new_vocab_size}")

    with open(output_path, "w") as f:
        json.dump(config, f, indent=2)


def resize_embeddings(input_path: Path, output_path: Path,
                      old_vocab: int, new_vocab: int, tie_embeddings: bool):
    """Resize embedding and lm_head weights to accommodate new tokens."""
    print(f"  Loading model weights from {input_path} ...")
    state_dict = load_file(str(input_path))

    embed_key = "model.embed_tokens.weight"
    lm_head_key = "lm_head.weight"

    if embed_key not in state_dict:
        raise KeyError(f"{embed_key} not found in state_dict. Keys: {list(state_dict.keys())[:10]}")

    embed = state_dict[embed_key]
    print(f"  embed_tokens shape: {embed.shape}")

    hidden_size = embed.shape[1]
    extra = new_vocab - old_vocab

    # Initialize new embeddings as mean of existing (better than random for byte tokens)
    mean_embed = embed.mean(dim=0, keepdim=True)
    # Add small noise to avoid identical embeddings
    noise = torch.randn(extra, hidden_size, dtype=embed.dtype) * 0.01
    new_rows = mean_embed.expand(extra, -1) + noise

    new_embed = torch.cat([embed, new_rows], dim=0)
    state_dict[embed_key] = new_embed
    print(f"  embed_tokens resized: {embed.shape} -> {new_embed.shape}")

    if tie_embeddings:
        # When tie_word_embeddings=True, lm_head shares embed_tokens.
        # Remove lm_head if present (it will be tied automatically).
        if lm_head_key in state_dict:
            del state_dict[lm_head_key]
            print("  lm_head removed (tie_word_embeddings=True)")
    else:
        if lm_head_key in state_dict:
            lm_head = state_dict[lm_head_key]
            mean_lm = lm_head.mean(dim=0, keepdim=True)
            noise_lm = torch.randn(extra, hidden_size, dtype=lm_head.dtype) * 0.01
            new_lm = torch.cat([lm_head, mean_lm.expand(extra, -1) + noise_lm], dim=0)
            state_dict[lm_head_key] = new_lm
            print(f"  lm_head resized: {lm_head.shape} -> {new_lm.shape}")

    print(f"  Saving to {output_path} ...")
    save_file(state_dict, str(output_path))


def main():
    parser = argparse.ArgumentParser(description="Fix tokenizer byte-fallback for GGUF")
    parser.add_argument("--input", type=Path, required=True, help="Input HF checkpoint dir")
    parser.add_argument("--output", type=Path, required=True, help="Output fixed HF checkpoint dir")
    parser.add_argument("--sp_model", type=Path, default=None,
                        help="SentencePiece .model file to copy (for GGUF conversion)")
    args = parser.parse_args()

    input_dir = args.input
    output_dir = args.output

    if not input_dir.exists():
        print(f"ERROR: Input directory not found: {input_dir}")
        return 1

    output_dir.mkdir(parents=True, exist_ok=True)

    # Load config to check tie_word_embeddings
    with open(input_dir / "config.json") as f:
        config = json.load(f)
    old_vocab = config["vocab_size"]
    new_vocab = old_vocab + BYTE_FALLBACK_COUNT
    tie_embeddings = config.get("tie_word_embeddings", False)

    print("=== Byte-Fallback Fix ===")
    print(f"Input:  {input_dir}")
    print(f"Output: {output_dir}")
    print(f"Old vocab: {old_vocab}, New vocab: {new_vocab}")
    print(f"tie_word_embeddings: {tie_embeddings}")
    print()

    # 1. Fix tokenizer.json
    print("[1/4] Fixing tokenizer.json ...")
    fix_tokenizer_json(
        input_dir / "tokenizer.json",
        output_dir / "tokenizer.json",
    )

    # 2. Fix config.json
    print("[2/4] Fixing config.json ...")
    fix_config_json(
        input_dir / "config.json",
        output_dir / "config.json",
        new_vocab,
    )

    # 3. Resize model weights
    print("[3/4] Resizing embeddings ...")
    resize_embeddings(
        input_dir / "model.safetensors",
        output_dir / "model.safetensors",
        old_vocab, new_vocab, tie_embeddings,
    )

    # 4. Copy other files
    print("[4/4] Copying remaining files ...")
    for fname in ["tokenizer_config.json", "generation_config.json"]:
        src = input_dir / fname
        if src.exists():
            shutil.copy2(src, output_dir / fname)
            print(f"  Copied {fname}")

    # Copy SentencePiece model if provided (needed for GGUF conversion)
    if args.sp_model and args.sp_model.exists():
        shutil.copy2(args.sp_model, output_dir / "tokenizer.model")
        print(f"  Copied tokenizer.model from {args.sp_model}")
    elif (input_dir / "tokenizer.model").exists():
        shutil.copy2(input_dir / "tokenizer.model", output_dir / "tokenizer.model")
        print("  Copied tokenizer.model from input dir")

    # Update tokenizer_config.json to add added_tokens_decoder for byte tokens
    tc_path = output_dir / "tokenizer_config.json"
    if tc_path.exists():
        with open(tc_path) as f:
            tc = json.load(f)
        added_tokens_decoder = tc.get("added_tokens_decoder", {})
        for i in range(BYTE_FALLBACK_COUNT):
            token_id = old_vocab + i
            byte_token = BYTE_TOKEN_TEMPLATE.format(i)
            added_tokens_decoder[str(token_id)] = {
                "content": byte_token,
                "lstrip": False,
                "normalized": False,
                "rstrip": False,
                "single_word": False,
                "special": True,
            }
        tc["added_tokens_decoder"] = added_tokens_decoder
        with open(tc_path, "w") as f:
            json.dump(tc, f, indent=2)
        print(f"  Updated tokenizer_config.json with {BYTE_FALLBACK_COUNT} byte tokens")

    print()
    print(f"=== Done! Fixed checkpoint at: {output_dir} ===")
    print(f"Next: python outputs/llama.cpp/convert_hf_to_gguf.py {output_dir} --outfile outputs/gguf/frankenstallm-3b-f16.gguf --outtype f16")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
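A quick way to confirm the fix took is to encode a newline with the patched tokenizer and check that it now resolves to a byte token instead of `<unk>`. A minimal sketch, assuming the `tokenizers` package and the `--output` directory from the usage example above:

```bash
# Hypothetical check — path assumes the --output directory used above.
python3 - <<'EOF'
from tokenizers import Tokenizer

tok = Tokenizer.from_file("outputs/hf_checkpoint-best-fixed/tokenizer.json")
enc = tok.encode("첫 줄\n둘째 줄")
print(enc.tokens)  # expect the "\n" to surface as <0x0A>, not <unk>
EOF
```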
source/scripts/hourly_status.sh
ADDED
@@ -0,0 +1,241 @@

#!/usr/bin/env bash
# =============================================================================
# hourly_status.sh — FRANKENSTALLM 3B Hourly Training Status Report (Telegram)
# Run: every hour via cron
# Sends a rich formatted message with progress, loss, ETA, GPU/disk summary.
# =============================================================================
set -euo pipefail

# ─── Paths ───────────────────────────────────────────────────────────────────
WORKDIR="${WORKDIR:-$(cd "$(dirname "$0")/.." && pwd)}"
CKPT_DIR="$WORKDIR/checkpoints/korean_3b_fp8_run1"
LOG_FILE="$CKPT_DIR/train.log"
PID_FILE="$CKPT_DIR/train.pid"
HOURLY_LOG="$CKPT_DIR/hourly_status.log"
NOTIFY="python3 $WORKDIR/scripts/telegram_notify.py"

TOTAL_STEPS=57000
TOTAL_TOKENS_B=114   # billion tokens target (57K steps × batch)

# ─── Helpers ─────────────────────────────────────────────────────────────────
ts()  { date '+%Y-%m-%d %H:%M:%S'; }
log() { echo "[$(ts)] $*"; }

# Safely get last matching value from log
parse_last() {
    local pattern="$1"
    grep -oP "$pattern" "$LOG_FILE" 2>/dev/null | tail -1 || echo ""
}

# ─── Parse training log ───────────────────────────────────────────────────────
parse_log() {
    if [[ ! -f "$LOG_FILE" ]]; then
        echo "NO_LOG"
        return 1
    fi

    # Get the last step line
    LAST_LINE=$(grep -E 'step\s+[0-9]+.*loss' "$LOG_FILE" 2>/dev/null | tail -1 || echo "")
    if [[ -z "$LAST_LINE" ]]; then
        echo "NO_STEPS"
        return 1
    fi

    CURRENT_STEP=$(echo "$LAST_LINE" | grep -oP 'step\s+\K[0-9]+' || echo "0")
    CURRENT_LOSS=$(echo "$LAST_LINE" | grep -oP 'loss\s+\K[0-9.]+' || echo "N/A")
    CURRENT_LR=$(echo "$LAST_LINE" | grep -oP 'lr\s+\K[0-9.e+-]+' || echo "N/A")
    CURRENT_GNORM=$(echo "$LAST_LINE" | grep -oP 'gnorm\s+\K[0-9.]+' || echo "N/A")
    CURRENT_TOKPS=$(echo "$LAST_LINE" | grep -oP 'tok/s\s+\K[\d,]+' | tr -d ',' || echo "0")
    CURRENT_MEM=$(echo "$LAST_LINE" | grep -oP 'mem\s+\K[0-9.]+GB' || echo "N/A")
    CURRENT_EPOCH=$(echo "$LAST_LINE" | grep -oP 'epoch\s+\K[0-9]+' || echo "0")

    # Log timestamp — parse from the line itself
    LOG_TS=$(echo "$LAST_LINE" | grep -oP '\[\K[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}' || echo "unknown")

    return 0
}

# ─── Calculate progress & ETA ─────────────────────────────────────────────────
compute_eta() {
    local step="$1"
    local tokps="$2"

    # Progress
    PROGRESS_PCT=$(echo "scale=1; $step * 100 / $TOTAL_STEPS" | bc -l 2>/dev/null || echo "0")

    # Steps remaining
    STEPS_LEFT=$(( TOTAL_STEPS - step ))

    # Tokens processed so far (approx: step × 2M tokens/step for 3B, bs=4, seqlen=4096, 8gpu)
    # bs=4, accum=8, 8gpu → effective batch = 4*8*8=256 sequences × 4096 tokens = 1,048,576 ≈ 1M tok/step
    TOKENS_PROCESSED_B=$(echo "scale=2; $step * 1048576 / 1000000000" | bc -l 2>/dev/null || echo "0")

    # ETA using current tok/s
    if [[ "$tokps" -gt 0 ]]; then
        # tokens remaining
        local tokens_left_b
        tokens_left_b=$(echo "scale=2; ($TOTAL_STEPS - $step) * 1048576 / 1000000000" | bc -l 2>/dev/null || echo "0")
        local tokens_left
        tokens_left=$(echo "scale=0; ($TOTAL_STEPS - $step) * 1048576" | bc -l 2>/dev/null || echo "0")
        local secs_left
        secs_left=$(echo "scale=0; $tokens_left / $tokps" | bc -l 2>/dev/null || echo "0")

        ETA_HOURS=$(echo "scale=1; $secs_left / 3600" | bc -l 2>/dev/null || echo "N/A")
        if [[ "$ETA_HOURS" != "N/A" ]]; then
            local eta_epoch
            eta_epoch=$(( $(date +%s) + secs_left ))
            ETA_DATETIME=$(date -d "@$eta_epoch" '+%m/%d %H:%M' 2>/dev/null || echo "N/A")
        else
            ETA_DATETIME="N/A"
        fi
    else
        ETA_HOURS="N/A"
        ETA_DATETIME="N/A"
    fi
}

# ─── GPU summary ─────────────────────────────────────────────────────────────
get_gpu_summary() {
    if ! command -v nvidia-smi &>/dev/null; then
        GPU_SUMMARY="nvidia-smi not available"
        GPU_AVG_UTIL="N/A"
        GPU_TOTAL_MEM="N/A"
        return
    fi

    local raw
    raw=$(nvidia-smi --query-gpu=index,utilization.gpu,memory.used,memory.total \
          --format=csv,noheader,nounits 2>/dev/null || echo "")

    if [[ -z "$raw" ]]; then
        GPU_SUMMARY="GPU query failed"
        GPU_AVG_UTIL="N/A"
        GPU_TOTAL_MEM="N/A"
        return
    fi

    # avg util
    GPU_AVG_UTIL=$(echo "$raw" | awk -F', ' '{sum+=$2; count++} END {printf "%.0f%%", sum/count}')

    # total mem used / total
    GPU_TOTAL_MEM=$(echo "$raw" | awk -F', ' \
        '{used+=$3; total+=$4} END {printf "%.1f / %.1f GiB", used/1024, total/1024}')

    # Per-GPU one-liner: "G0:95% 48G | G1:94% 48G | ..."
    GPU_SUMMARY=$(echo "$raw" | awk -F', ' \
        '{printf "G%s:%s%% %sMiB | ", $1, $2, $3}' | sed 's/ | $//')
}

# ─── Disk usage ──────────────────────────────────────────────────────────────
get_disk_info() {
    DISK_INFO=$(df -h "$CKPT_DIR" 2>/dev/null | awk 'NR==2 {printf "%s used / %s total (%s)", $3, $2, $5}' || echo "N/A")
    CKPT_COUNT=$(ls -d "$CKPT_DIR"/checkpoint-* 2>/dev/null | wc -l || echo "0")
    LAST_CKPT=$(ls -dt "$CKPT_DIR"/checkpoint-* 2>/dev/null | head -1 | xargs basename 2>/dev/null || echo "none")
}

# ─── Process status ───────────────────────────────────────────────────────────
get_process_status() {
    PROC_STATUS="UNKNOWN"
    if [[ -f "$PID_FILE" ]]; then
        local pid
        pid=$(cat "$PID_FILE" 2>/dev/null | tr -d '[:space:]')
        if [[ -n "$pid" ]] && kill -0 "$pid" 2>/dev/null; then
            PROC_STATUS="RUNNING (PID $pid)"
        else
            PROC_STATUS="STOPPED (PID $pid)"
        fi
    else
        PROC_STATUS="NO PID FILE"
    fi
}

# ─── Build & send message ────────────────────────────────────────────────────
build_and_send() {
    local step="$CURRENT_STEP"
    local loss="$CURRENT_LOSS"
    local tokps="$CURRENT_TOKPS"

    # Status icon
    local status_icon
    if [[ "$PROC_STATUS" == RUNNING* ]]; then
        status_icon="✅"   # green check
    else
        status_icon="❌"   # red X
    fi

    # Progress bar (20 chars)
    local bar_filled=$(echo "scale=0; $PROGRESS_PCT * 20 / 100" | bc -l 2>/dev/null || echo "0")
    local bar_empty=$(( 20 - bar_filled ))
    PROGRESS_BAR=$(printf '%0.s█' $(seq 1 $bar_filled 2>/dev/null) ; printf '%0.s░' $(seq 1 $bar_empty 2>/dev/null)) || PROGRESS_BAR="[$PROGRESS_PCT%]"

    local msg
    msg="$(cat <<EOF
<b>FRANKENSTALLM 3B — Hourly Status</b>
<i>$(ts)</i>

$status_icon <b>Process:</b> $PROC_STATUS

<b>Progress</b>
Step:     <code>$step / $TOTAL_STEPS</code> ($PROGRESS_PCT%)
Tokens:   <code>${TOKENS_PROCESSED_B}B / ${TOTAL_TOKENS_B}B</code>
Epoch:    <code>$CURRENT_EPOCH</code>
Last log: <code>$LOG_TS</code>

<b>Training Metrics</b>
Loss:  <code>$loss</code>
LR:    <code>$CURRENT_LR</code>
Gnorm: <code>$CURRENT_GNORM</code>
Tok/s: <code>$tokps</code>
Mem:   <code>$CURRENT_MEM</code>

<b>ETA</b>
Steps left: <code>$STEPS_LEFT</code>
Remaining:  <code>~$ETA_HOURS h</code>
Est. done:  <code>$ETA_DATETIME</code>

<b>GPU</b>
Avg util:  <code>$GPU_AVG_UTIL</code>
Total mem: <code>$GPU_TOTAL_MEM</code>

<b>Checkpoints</b>
Last saved: <code>$LAST_CKPT</code>
Total:      <code>$CKPT_COUNT</code> checkpoints

<b>Disk</b>
<code>$DISK_INFO</code>
EOF
)"

    log "Sending hourly status report (step $step)..."
    $NOTIFY "$msg" || {
        log "ERROR: Failed to send Telegram message."
        return 1
    }
    log "Status report sent."
}

# ─── Main ────────────────────────────────────────────────────────────────────
main() {
    log "=== Hourly status START ==="

    parse_log || {
        log "Cannot parse log — sending minimal status."
        $NOTIFY "<b>FRANKENSTALLM 3B</b> — Status check at $(ts)

<b>WARNING:</b> Cannot read training log at:
<code>$LOG_FILE</code>

Process status: $(cat "$PID_FILE" 2>/dev/null && echo "(PID found)" || echo "(no PID file)")" || true
        return 0
    }

    compute_eta "$CURRENT_STEP" "$CURRENT_TOKPS"
    get_gpu_summary
    get_disk_info
    get_process_status
    build_and_send

    log "=== Hourly status END ==="
}

main "$@"
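The header says this runs hourly from cron. A crontab entry along these lines would do it; the repo path below is a placeholder, not a path this upload defines:

```bash
# Hypothetical crontab entry (install with `crontab -e`); <repo> is a placeholder.
# Runs at minute 0 of every hour, appending to the same hourly_status.log the script names.
0 * * * * bash /PROJECT/<repo>/scripts/hourly_status.sh >> /PROJECT/<repo>/checkpoints/korean_3b_fp8_run1/hourly_status.log 2>&1
```

Cron starts with a minimal environment, so the absolute path matters; the script's own `WORKDIR` default then resolves the rest relative to its location.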
source/scripts/launch_3b_orpo.sh
ADDED
@@ -0,0 +1,177 @@

#!/usr/bin/env bash
# =============================================================================
# launch_3b_orpo.sh — 8-GPU ORPO fine-tuning launcher for Korean 3B LLM
#
# Usage:
#   bash scripts/launch_3b_orpo.sh                   # default run
#   bash scripts/launch_3b_orpo.sh --max_steps 200   # quick test
#   RUN_NAME=my_orpo bash scripts/launch_3b_orpo.sh  # custom run name
#
# Base model : eval/outputs/hf_3b_sft_best (SFT v1 best)
# Data       : data/preference/combined_preference.jsonl
# Output     : checkpoints/korean_3b_orpo_v1/
# Log        : checkpoints/korean_3b_orpo_v1/train.log
#
# Expected checkpoint size:
#   model weights:    ~6GB (bf16)
#   optimizer states: ~24GB
#   total ~30GB each × max 5 kept = 150GB
# =============================================================================
set -euo pipefail

# ---- Configurable defaults --------------------------------------------------
RUN_NAME="${RUN_NAME:-korean_3b_orpo_v1}"
BASE_MODEL="${BASE_MODEL:-eval/outputs/hf_3b_sft_best}"
DATA_PATH="${DATA_PATH:-data/preference/combined_preference.jsonl}"
OUTPUT_DIR="checkpoints/${RUN_NAME}"
CKPT_DIR="checkpoints/${RUN_NAME}"
LOG_FILE="${CKPT_DIR}/train.log"
NPROC=8
MASTER_PORT="${MASTER_PORT:-29502}"

# ORPO hyperparameters
BATCH_SIZE=4
GRAD_ACCUM=4
LR=1.2e-5
BETA=0.25
EPOCHS=2
MAX_LENGTH=1536
WARMUP_RATIO=0.05
WEIGHT_DECAY=0.01
EVAL_SPLIT_RATIO=0.05
EVAL_STEPS=500
EARLY_STOPPING_PATIENCE=3
SAVE_TOTAL_LIMIT=5
SEED=42

EXTRA_ARGS="$@"

# ---- B200 / NVSwitch single-node NCCL tuning --------------------------------
# (kept identical to the NCCL settings in launch_3b_pretrain.sh)
export NCCL_IB_DISABLE=1
export NCCL_PROTO=Simple
export NCCL_MIN_NCHANNELS=16
export NCCL_MAX_NCHANNELS=16
# ORPO forward-backward passes swing memory more than pretraining, so keep the 128MB buffer
export NCCL_BUFFSIZE=134217728
export OMP_NUM_THREADS=9
export MKL_NUM_THREADS=9
# OOM prevention: mitigate fragmentation (ORPO forwards chosen/rejected together → memory-sensitive)
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
# Enable direct P2P NVLink communication
export NCCL_P2P_LEVEL=NVL
# Ring + Tree combined (sized for 3B gradients)
export NCCL_ALGO=Ring,Tree

export PYTHONWARNINGS="ignore::UserWarning:torch.library"

cd "$(dirname "$0")/.."

# ---- Pre-flight checks ------------------------------------------------------
if [[ ! -d "${BASE_MODEL}" ]]; then
    echo "ERROR: base model directory not found: ${BASE_MODEL}"
    echo "       Make sure SFT finished and was converted to HF format."
    echo "       e.g.: python scripts/convert_to_hf.py --checkpoint <sft_ckpt> --output ${BASE_MODEL}"
    exit 1
fi

if [[ ! -f "${DATA_PATH}" ]]; then
    echo "ERROR: training data not found: ${DATA_PATH}"
    echo "       Run the data merge script first:"
    echo "       python data/prepare_preference_combined.py"
    exit 1
fi

if [[ ! -f "train/orpo.py" ]]; then
    echo "ERROR: train/orpo.py not found"
    exit 1
fi

# GPU memory check
GPU_MEM=$(nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits 2>/dev/null | head -1 || echo "0")
if [[ "$GPU_MEM" -gt 0 && "$GPU_MEM" -lt 40000 ]]; then
    echo "WARNING: GPU memory ${GPU_MEM}MB < 40GB. May be insufficient for 3B ORPO training."
fi

# Prevent duplicate processes
EXISTING_PID=$(pgrep -f "orpo.py.*${RUN_NAME}" 2>/dev/null | head -1 || true)
if [[ -n "$EXISTING_PID" ]]; then
    echo "ERROR: an ORPO process is already running (PID: ${EXISTING_PID})"
    echo "       Terminate it first with: kill ${EXISTING_PID}"
    exit 1
fi

# Check free disk space (at least 200GB)
AVAIL_KB=$(df /PROJECT 2>/dev/null | awk 'NR==2{print $4}' || echo "0")
if [[ -n "$AVAIL_KB" && "$AVAIL_KB" -gt 0 && "$AVAIL_KB" -lt 209715200 ]]; then
    AVAIL_GB=$(echo "scale=1; $AVAIL_KB / 1048576" | bc 2>/dev/null || echo "?")
    echo "WARNING: /PROJECT has ${AVAIL_GB}GB free < 200GB. Checkpoint storage may run out."
fi

mkdir -p "${CKPT_DIR}" "${OUTPUT_DIR}"

# ---- Count data records -------------------------------------------------------
DATA_LINES=$(wc -l < "${DATA_PATH}" 2>/dev/null || echo "?")
echo " Training data records: ${DATA_LINES}"

# ---- Effective batch size -----------------------------------------------------
EFF_BATCH=$((BATCH_SIZE * NPROC * GRAD_ACCUM))

echo "=================================================================="
echo " Korean 3B LLM ORPO Fine-Tuning"
echo " Run name     : ${RUN_NAME}"
echo " Base model   : ${BASE_MODEL}"
echo " Data         : ${DATA_PATH} (${DATA_LINES} records)"
echo " Output dir   : ${OUTPUT_DIR}"
echo " CKPT dir     : ${CKPT_DIR}"
echo " Log file     : ${LOG_FILE}"
echo " Epochs       : ${EPOCHS}"
echo " LR           : ${LR}"
echo " Beta (ORPO)  : ${BETA}"
echo " Batch        : ${BATCH_SIZE} (local) × ${NPROC} GPU × ${GRAD_ACCUM} accum = ${EFF_BATCH}"
echo " Max length   : ${MAX_LENGTH}"
echo " Weight decay : ${WEIGHT_DECAY}"
echo " Eval steps   : ${EVAL_STEPS}"
echo " Early stop   : patience=${EARLY_STOPPING_PATIENCE}"
echo " Started      : $(date)"
echo "=================================================================="

torchrun \
    --nproc_per_node=${NPROC} \
    --master_port=${MASTER_PORT} \
    train/orpo.py \
    --model_path "${BASE_MODEL}" \
    --custom_data_path "${DATA_PATH}" \
    --output_dir "${OUTPUT_DIR}" \
    --epochs ${EPOCHS} \
    --lr ${LR} \
    --beta ${BETA} \
    --batch_size ${BATCH_SIZE} \
    --gradient_accumulation_steps ${GRAD_ACCUM} \
    --max_length ${MAX_LENGTH} \
    --weight_decay ${WEIGHT_DECAY} \
    --eval_split_ratio ${EVAL_SPLIT_RATIO} \
    --eval_steps ${EVAL_STEPS} \
    --early_stopping_patience ${EARLY_STOPPING_PATIENCE} \
    --save_total_limit ${SAVE_TOTAL_LIMIT} \
    ${EXTRA_ARGS} \
    2>&1 | tee "${LOG_FILE}" \
         | grep -v "UserWarning" \
         | grep -v "Warning only once" \
         | grep -v "Overriding a previously" \
         | grep -v "dispatch key:" \
         | grep -v "previous kernel:" \
         | grep -v "new kernel:" \
         | grep -v "operator: flash_attn" \
         | grep -v "registered at /usr/local" \
         | grep -v "self.m.impl"

# NOTE: with `set -o pipefail`, $? here reflects the whole tee/grep pipeline,
# not torchrun alone; launch_3b_pretrain.sh documents the PIPESTATUS-based fix.
EXIT_CODE=$?
echo "=================================================================="
echo " Done     : $(date)"
echo " Exit code: ${EXIT_CODE}"
if [[ "${EXIT_CODE}" -eq 0 ]]; then
    echo " Model saved to: ${OUTPUT_DIR}"
fi
echo "=================================================================="
exit $EXIT_CODE
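For reference on the `--beta` knob: in the standard ORPO formulation (Hong et al., 2024), the SFT loss is combined with an odds-ratio penalty, and the weight the paper calls λ is commonly exposed as `beta` (as in TRL's ORPOTrainer); mapping this script's flag onto that term is an assumption about `train/orpo.py`. For chosen response $y_w$ and rejected response $y_l$:

$$
\mathcal{L}_{\text{ORPO}} = \mathcal{L}_{\text{SFT}} + \beta\,\mathcal{L}_{\text{OR}},
\qquad
\mathcal{L}_{\text{OR}} = -\log \sigma\!\left(\log \frac{\operatorname{odds}_\theta(y_w \mid x)}{\operatorname{odds}_\theta(y_l \mid x)}\right),
\qquad
\operatorname{odds}_\theta(y \mid x) = \frac{P_\theta(y \mid x)}{1 - P_\theta(y \mid x)}
$$

Under that reading, `BETA=0.25` puts a quarter weight on the preference term relative to the plain SFT loss, which is why no separate reference model is needed.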
source/scripts/launch_3b_pretrain.sh
ADDED
@@ -0,0 +1,258 @@

#!/usr/bin/env bash
# =============================================================================
# launch_3b_pretrain.sh — 8-GPU FP8 pretraining launcher for Korean 3B LLM
#
# Features:
#   - SIGHUP defense: on SSH disconnect, automatically re-wraps itself with nohup+setsid
#   - Graceful shutdown: on SIGTERM the Python signal handler saves an emergency checkpoint
#   - Auto resume: automatically resumes from the latest checkpoint
#   - PID file: for process monitoring and control
#   - grep pipeline exit-code protection (|| true)
#
# Usage:
#   bash scripts/launch_3b_pretrain.sh                  # full run (60B tokens)
#   bash scripts/launch_3b_pretrain.sh --max_steps 500  # quick test
#   bash scripts/launch_3b_pretrain.sh --resume checkpoints/korean_3b_fp8_run1/checkpoint-0010000
#   MAX_STEPS=95000 bash scripts/launch_3b_pretrain.sh  # 100B tokens
#
# Monitoring:
#   tail -f checkpoints/korean_3b_fp8_run1/train.log
#   cat checkpoints/korean_3b_fp8_run1/train.pid
#
# Stop (emergency checkpoint saved automatically):
#   kill $(cat checkpoints/korean_3b_fp8_run1/train.pid)
#
# Force kill (no checkpoint saved):
#   kill -9 $(cat checkpoints/korean_3b_fp8_run1/train.pid)
# =============================================================================

# -u: error on undefined variables
# NOTE: -e and -o pipefail are deliberately omitted.
# Past problem: when the grep pipeline filtered out every line, it returned exit code 1
#   → pipefail propagated that as a script failure → training aborted
# Fix: drop set -e/pipefail and add || true to the grep chain
set -u

# ---- Configurable defaults --------------------------------------------------
RUN_NAME="${RUN_NAME:-korean_3b_fp8_run1}"
CONFIG="${CONFIG:-configs/korean_3b_fp8.yaml}"
TRAIN_DATA="${TRAIN_DATA:-data/3b_train.bin}"
VAL_DATA="${VAL_DATA:-data/3b_val.bin}"
CKPT_DIR="checkpoints/${RUN_NAME}"
LOG_FILE="${CKPT_DIR}/train.log"
NPROC=8
MASTER_PORT="${MASTER_PORT:-29501}"

MAX_STEPS="${MAX_STEPS:-57000}"
BATCH_SIZE=5
GRAD_ACCUM=8
WARMUP_STEPS=2000
SEED=42

# ---- B200 / NVSwitch single-node NCCL tuning (3B optimized, v2) ----------
export NCCL_IB_DISABLE=1
export NCCL_ALGO=NVLS,Ring        # NVSwitch hardware reduction first (was Ring,Tree)
export NCCL_PROTO=Simple
export NCCL_NVLS_ENABLE=1         # NVLink SHARP — hardware-accelerated all-reduce
export NCCL_MIN_NCHANNELS=32      # raise minimum for NVSwitch headroom (was 16)
export NCCL_MAX_NCHANNELS=32
export NCCL_BUFFSIZE=268435456    # 256MB (was 128MB) — reduces bucket pipeline stalls
export NCCL_P2P_LEVEL=NVL
export NCCL_NET_GDR_LEVEL=0
export OMP_NUM_THREADS=4
export MKL_NUM_THREADS=4
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
# Triton/Inductor cache on executable filesystem (not /tmp which is noexec)
export TRITON_CUDACRT_PATH=/usr/local/cuda/include
export TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas

cd "$(dirname "$0")/.."

mkdir -p "${CKPT_DIR}"

# ---- Session protection (SIGHUP defense) ---------------------------------------
# When run outside tmux/screen, automatically wrap with nohup + setsid so the
# training process survives an SSH disconnect (SIGHUP).
#
# How it works:
#   1. Check for tmux/screen or an already-protected invocation
#   2. If unprotected, set _LAUNCH_PROTECTED=1 and re-exec ourselves under nohup setsid
#   3. The re-executed process becomes a new session leader, detached from the terminal
#   4. The original shell prints the PID and monitoring commands, then exits immediately
PID_FILE="${CKPT_DIR}/train.pid"

if [[ -z "${_LAUNCH_PROTECTED:-}" ]] && [[ -z "${TMUX:-}" ]] && [[ -z "${STY:-}" ]]; then
    export _LAUNCH_PROTECTED=1
    NOHUP_LOG="${CKPT_DIR}/launch_$(date +%Y%m%d_%H%M%S).log"

    echo "=================================================================="
    echo " SIGHUP PROTECTION ACTIVATED"
    echo " No tmux/screen detected → session protection auto-enabled (nohup + setsid)"
    echo " Training will keep running even if SSH disconnects."
    echo "=================================================================="
    echo ""

    # Re-execute ourselves in session-protected mode
    nohup setsid bash "$0" "$@" > "${NOHUP_LOG}" 2>&1 &
    BG_PID=$!
    echo "${BG_PID}" > "${PID_FILE}"

    echo " PID        : ${BG_PID}"
    echo " PID file   : ${PID_FILE}"
    echo " Launch log : ${NOHUP_LOG}"
    echo " Train log  : ${LOG_FILE}"
    echo ""
    echo " Monitoring:"
    echo "   tail -f ${LOG_FILE}"
    echo ""
    echo " Stop (emergency checkpoint saved automatically):"
    echo "   kill \$(cat ${PID_FILE})"
    echo ""
    echo " Force kill:"
    echo "   kill -9 \$(cat ${PID_FILE})"
    echo "=================================================================="
    exit 0
fi

# ---- Cleanup on exit --------------------------------------------------------
PREWARM_PID=""

cleanup() {
    rm -f "${PID_FILE}" 2>/dev/null || true
    if [[ -n "${PREWARM_PID:-}" ]]; then
        kill "${PREWARM_PID}" 2>/dev/null || true
    fi
}
trap cleanup EXIT

# Record the PID file (keeps the PID trackable even when run inside tmux/screen)
echo "$$" > "${PID_FILE}"

# ---- Pre-flight checks ------------------------------------------------------
if [[ ! -f "${CONFIG}" ]]; then
    echo "[ERROR] Config not found: ${CONFIG}"
    exit 1
fi

if [[ ! -f "${TRAIN_DATA}" ]]; then
    echo "[ERROR] Training data not found: ${TRAIN_DATA}"
    exit 1
fi

# GPU memory check (3B recommends at least 80GB/GPU; B200=192GB → OK)
GPU_MEM=$(nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits 2>/dev/null | head -1 || echo "0")
if [[ "$GPU_MEM" -gt 0 && "$GPU_MEM" -lt 80000 ]]; then
    echo "[WARN] GPU memory ${GPU_MEM}MB < 80GB. May be insufficient for 3B training."
fi

# Prevent duplicate processes
EXISTING_PID=$(pgrep -f "pretrain.py.*korean_3b" 2>/dev/null | head -1 || true)
if [[ -n "$EXISTING_PID" ]]; then
    echo "[ERROR] A 3B pretrain process is already running (PID: ${EXISTING_PID})"
    echo "        Terminate it first with: kill ${EXISTING_PID}"
    exit 1
fi

# Check free disk space (at least 1TB needed)
AVAIL_KB=$(df /PROJECT 2>/dev/null | awk 'NR==2{print $4}')
if [[ -n "${AVAIL_KB:-}" ]] && [[ "$AVAIL_KB" -lt 1073741824 ]]; then
    AVAIL_TB=$(echo "scale=1; $AVAIL_KB / 1073741824" | bc 2>/dev/null || echo "?")
    echo "[WARN] /PROJECT has ${AVAIL_TB}TB free < 1TB. Checkpoint storage may run out."
fi

# ---- Resume detection -------------------------------------------------------
RESUME_ARG=""
EXTRA_ARGS="${*:-}"
if [[ ! "${EXTRA_ARGS}" =~ "--resume" ]]; then
    # Auto-detect the most recent checkpoint
    LATEST_CKPT=$(ls -d "${CKPT_DIR}"/checkpoint-* 2>/dev/null | sort -V | tail -1 || true)
    if [[ -n "$LATEST_CKPT" ]]; then
        echo "[INFO] Auto-resume detected: ${LATEST_CKPT}"
        RESUME_ARG="--resume ${LATEST_CKPT}"
    fi
fi

# ---- Banner ------------------------------------------------------------------
SESSION_TYPE="direct"
[[ -n "${TMUX:-}" ]] && SESSION_TYPE="tmux"
[[ -n "${STY:-}" ]] && SESSION_TYPE="screen"
[[ -n "${_LAUNCH_PROTECTED:-}" ]] && SESSION_TYPE="protected (nohup+setsid)"

echo "=================================================================="
echo " Korean 3B LLM Pre-Training (FP8)"
echo " Run name    : ${RUN_NAME}"
echo " Config      : ${CONFIG}"
echo " CKPT dir    : ${CKPT_DIR}"
echo " Log file    : ${LOG_FILE}"
echo " Max steps   : ${MAX_STEPS}"
echo " Batch       : ${BATCH_SIZE} (local) x ${NPROC} GPU x ${GRAD_ACCUM} accum"
echo " Eff tokens  : $((BATCH_SIZE * NPROC * GRAD_ACCUM * 4096)) tokens/step (~1M)"
echo " Total tokens: ~$((MAX_STEPS * BATCH_SIZE * NPROC * GRAD_ACCUM * 4096 / 1000000000))B"
echo " Resume      : ${RESUME_ARG:-none (fresh start)}"
echo " Session     : ${SESSION_TYPE}"
echo " PID         : $$ (file: ${PID_FILE})"
echo " Started     : $(date)"
echo "=================================================================="

export PYTHONWARNINGS="ignore::UserWarning:torch.library"

# ---- Pre-warm OS page cache (NUMA-interleaved, non-blocking) ---------------
if [[ -f "${TRAIN_DATA}" ]]; then
    echo "[INFO] Pre-warming page cache for ${TRAIN_DATA} (NUMA interleaved)..."
    numactl --interleave=all dd if="${TRAIN_DATA}" of=/dev/null bs=16M 2>/dev/null &
    PREWARM_PID=$!
fi

# ---- Launch training ---------------------------------------------------------
# grep pipeline protection:
#   Problem: grep -v returns exit code 1 when it outputs no lines
#   Fix: wrap with { ... || true; } so the pipeline always exits 0;
#        torchrun's real exit code is captured separately via PIPESTATUS[0]
numactl --interleave=all \
torchrun \
    --nproc_per_node=${NPROC} \
    --master_port=${MASTER_PORT} \
    train/pretrain.py \
    --config "${CONFIG}" \
    --train_data "${TRAIN_DATA}" \
    --val_data "${VAL_DATA}" \
    --checkpoint_dir "${CKPT_DIR}" \
    --log_file "${LOG_FILE}" \
    --max_steps ${MAX_STEPS} \
    --batch_size ${BATCH_SIZE} \
    --grad_accum ${GRAD_ACCUM} \
    --warmup_steps ${WARMUP_STEPS} \
    --seed ${SEED} \
    ${RESUME_ARG} \
    ${EXTRA_ARGS} \
    2>&1 | { grep -v "UserWarning" \
           | grep -v "Warning only once" \
           | grep -v "Overriding a previously" \
           | grep -v "dispatch key:" \
           | grep -v "previous kernel:" \
           | grep -v "new kernel:" \
           | grep -v "operator: flash_attn" \
           | grep -v "registered at /usr/local" \
           | grep -v "self.m.impl" \
           || true; }

EXIT_CODE=${PIPESTATUS[0]}

# ---- Exit summary ------------------------------------------------------------
echo ""
echo "=================================================================="
echo " Finished : $(date)"
|
| 245 |
+
echo " Exit code : ${EXIT_CODE}"
|
| 246 |
+
if [[ ${EXIT_CODE} -eq 0 ]]; then
|
| 247 |
+
echo " Status : SUCCESS (학습 완료 또는 graceful shutdown)"
|
| 248 |
+
elif [[ ${EXIT_CODE} -eq 143 ]]; then
|
| 249 |
+
echo " Status : TERMINATED (SIGTERM — 비상 체크포인트 저장됨)"
|
| 250 |
+
elif [[ ${EXIT_CODE} -eq 137 ]]; then
|
| 251 |
+
echo " Status : KILLED (SIGKILL — 강제 종료, 체크포인트 미저장)"
|
| 252 |
+
elif [[ ${EXIT_CODE} -eq 1 ]]; then
|
| 253 |
+
echo " Status : ERROR (${LOG_FILE} 확인 필요)"
|
| 254 |
+
else
|
| 255 |
+
echo " Status : FAILED (exit code ${EXIT_CODE}, ${LOG_FILE} 확인)"
|
| 256 |
+
fi
|
| 257 |
+
echo "=================================================================="
|
| 258 |
+
exit ${EXIT_CODE}
|
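The PIPESTATUS trick above is easy to verify in isolation. A minimal sketch (run interactively without errexit; `bash -c '...; exit 7'` stands in for torchrun):

```bash
# The filter group always exits 0 thanks to `|| true`, but PIPESTATUS keeps
# one status per pipeline element, so index 0 still holds the producer's code.
bash -c 'echo noise; exit 7' 2>&1 | { grep -v "noise" || true; }
echo "producer exit code: ${PIPESTATUS[0]}"   # prints 7; plain $? would be 0
```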
source/scripts/launch_3b_sft.sh
ADDED
@@ -0,0 +1,145 @@
#!/usr/bin/env bash
# =============================================================================
# launch_3b_sft.sh — 8-GPU FP8 SFT launcher for 3B Korean LLM
#
# Usage:
#   bash scripts/launch_3b_sft.sh
#   bash scripts/launch_3b_sft.sh --max_steps 200   # quick test
#   bash scripts/launch_3b_sft.sh --resume checkpoints/korean_3b_sft_v1/checkpoint-0002000
#
# Base model : checkpoints/korean_3b_fp8_run1/checkpoint-XXXXXX (default)
#              can be overridden with the --base_checkpoint argument
# SFT data   : data/sft_combined/train_filtered.jsonl
#              (run scripts/prepare_sft_combined.sh → data/filter_sft_v2.py first)
#
# Effective batch: 2 (local) × 8 GPU × 4 (grad_accum) = 64 samples/step
# =============================================================================
set -euo pipefail

# ---- Configurable defaults --------------------------------------------------
RUN_NAME="${RUN_NAME:-korean_3b_sft_v1}"
CONFIG="${CONFIG:-configs/korean_3b_sft.yaml}"
BASE_CHECKPOINT="${BASE_CHECKPOINT:-checkpoints/korean_3b_fp8_run1/checkpoint-0057000}"
SFT_DATA="${SFT_DATA:-data/sft_combined/train_filtered.jsonl}"
VAL_DATA="${VAL_DATA:-data/sft_combined/val_filtered.jsonl}"
CKPT_DIR="checkpoints/${RUN_NAME}"
LOG_FILE="${CKPT_DIR}/train.log"
NPROC=8
MASTER_PORT="${MASTER_PORT:-29503}"

MAX_STEPS=33000
BATCH_SIZE=2
GRAD_ACCUM=4
LR="1.0e-5"
WARMUP_STEPS=500
SEED=42

EXTRA_ARGS="$@"

# ---- B200 / NVSwitch NCCL tuning (same as pretrain) -------------------------
export NCCL_IB_DISABLE=1
export NCCL_ALGO=Ring
export NCCL_PROTO=Simple
export NCCL_MIN_NCHANNELS=16
export NCCL_MAX_NCHANNELS=16
export NCCL_BUFFSIZE=67108864
export OMP_NUM_THREADS=4
export MKL_NUM_THREADS=4

# Save VRAM on the 3B model: allow dynamic memory segment expansion
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

cd "$(dirname "$0")/.."

# ---- Pre-flight checks ------------------------------------------------------
if [[ ! -d "${BASE_CHECKPOINT}" ]]; then
    echo "=================================================================="
    echo " ERROR: Base checkpoint directory not found."
    echo " Path: ${BASE_CHECKPOINT}"
    echo ""
    echo " Pass the real path via the --base_checkpoint argument or"
    echo " set the BASE_CHECKPOINT environment variable."
    echo " Example: bash scripts/launch_3b_sft.sh --base_checkpoint checkpoints/korean_3b_fp8_run1/checkpoint-0057000"
    echo "=================================================================="
    exit 1
fi

if [[ ! -f "${SFT_DATA}" ]]; then
    echo "=================================================================="
    echo " ERROR: SFT training data not found: ${SFT_DATA}"
    echo ""
    echo " Data preparation steps:"
    echo "   1. bash scripts/prepare_sft_combined.sh"
    echo "   2. python data/filter_sft_v2.py \\"
    echo "        --input data/sft_combined/train.jsonl \\"
    echo "        --output data/sft_combined/train_filtered.jsonl"
    echo "=================================================================="
    exit 1
fi

# Fall back to the original val.jsonl if the filtered val file is missing
if [[ ! -f "${VAL_DATA}" ]]; then
    VAL_FALLBACK="data/sft_combined/val.jsonl"
    if [[ -f "${VAL_FALLBACK}" ]]; then
        VAL_DATA="${VAL_FALLBACK}"
        echo "[INFO] val_filtered missing, falling back to: ${VAL_DATA}"
    else
        echo "ERROR: VAL_DATA file not found: ${VAL_DATA}"
        exit 1
    fi
fi

mkdir -p "${CKPT_DIR}"

echo "=================================================================="
echo " 3B SFT Fine-Tuning"
echo " Run name        : ${RUN_NAME}"
echo " Config          : ${CONFIG}"
echo " Base checkpoint : ${BASE_CHECKPOINT}"
echo " SFT data        : ${SFT_DATA}"
echo " Val data        : ${VAL_DATA}"
echo " CKPT dir        : ${CKPT_DIR}"
echo " Log file        : ${LOG_FILE}"
echo " Max steps       : ${MAX_STEPS}"
echo " Batch size      : ${BATCH_SIZE} (local) × ${NPROC} GPU × ${GRAD_ACCUM} grad_accum = $((BATCH_SIZE * NPROC * GRAD_ACCUM)) eff_batch"
echo " Learning rate   : ${LR}"
echo " Warmup          : ${WARMUP_STEPS} steps"
echo " Master port     : ${MASTER_PORT}"
echo " ALLOC_CONF      : ${PYTORCH_CUDA_ALLOC_CONF}"
echo " Started         : $(date)"
echo "=================================================================="

export PYTHONWARNINGS="ignore::UserWarning:torch.library"

torchrun \
  --nproc_per_node=${NPROC} \
  --master_port=${MASTER_PORT} \
  train/sft.py \
    --config "${CONFIG}" \
    --base_checkpoint "${BASE_CHECKPOINT}" \
    --sft_data "${SFT_DATA}" \
    --val_data "${VAL_DATA}" \
    --checkpoint_dir "${CKPT_DIR}" \
    --log_file "${LOG_FILE}" \
    --max_steps ${MAX_STEPS} \
    --batch_size ${BATCH_SIZE} \
    --grad_accum ${GRAD_ACCUM} \
    --lr ${LR} \
    --warmup_steps ${WARMUP_STEPS} \
    --seed ${SEED} \
    --use_fp8 \
    ${EXTRA_ARGS} \
  2>&1 | grep -v "UserWarning" \
    | grep -v "Warning only once" \
    | grep -v "Overriding a previously" \
    | grep -v "dispatch key:" \
    | grep -v "previous kernel:" \
    | grep -v "new kernel:" \
    | grep -v "operator: flash_attn" \
    | grep -v "registered at /usr/local" \
    | grep -v "self.m.impl" \
    | tee -a "${LOG_FILE}"

echo "=================================================================="
echo " 3B SFT Done : $(date)"
echo "=================================================================="
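The header's 64-samples/step figure is just the product of the three batch knobs; a quick arithmetic check with the defaults above:

```bash
BATCH_SIZE=2; NPROC=8; GRAD_ACCUM=4; MAX_STEPS=33000
EFF_BATCH=$((BATCH_SIZE * NPROC * GRAD_ACCUM))   # 2 x 8 x 4 = 64 samples/step
echo "eff_batch=${EFF_BATCH}, samples over full run=$((EFF_BATCH * MAX_STEPS))"   # 2,112,000
```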
source/scripts/launch_3b_sft_v2.sh
ADDED
@@ -0,0 +1,156 @@
#!/usr/bin/env bash
# =============================================================================
# launch_3b_sft_v2.sh — 8-GPU FP8 SFT v2 launcher for 3B Korean LLM
#
# SFT v2 improvements over v1:
#   - LR: 1e-5 → 5e-5 (5x, to resolve underfitting)
#   - Effective batch: 64 → 256 (4x)
#   - Data mixing: 70% SFT + 30% pretrain (forgetting prevention)
#   - Weight decay: 0.01 → 0.05
#   - Warmup: 500 → 2000 steps
#   - Max steps: 33000 → 15000
#
# Usage:
#   bash scripts/launch_3b_sft_v2.sh
#   bash scripts/launch_3b_sft_v2.sh --max_steps 200   # quick test
#   bash scripts/launch_3b_sft_v2.sh --resume checkpoints/korean_3b_sft_v2/checkpoint-0002000
#
# Effective batch: 4 (local) x 8 GPU x 8 (grad_accum) = 256 samples/step
# =============================================================================
set -euo pipefail

# ---- Configurable defaults --------------------------------------------------
RUN_NAME="${RUN_NAME:-korean_3b_sft_v2}"
CONFIG="${CONFIG:-configs/korean_3b_sft_v2.yaml}"
BASE_CHECKPOINT="${BASE_CHECKPOINT:-checkpoints/korean_3b_fp8_run1/checkpoint-0057000}"
SFT_DATA="${SFT_DATA:-data/sft_combined/train_filtered.jsonl}"
VAL_DATA="${VAL_DATA:-data/sft_combined/val_filtered.jsonl}"
PRETRAIN_DATA="${PRETRAIN_DATA:-data/3b_train.bin}"
CKPT_DIR="checkpoints/${RUN_NAME}"
LOG_FILE="${CKPT_DIR}/train.log"
NPROC=8
MASTER_PORT="${MASTER_PORT:-29504}"

MAX_STEPS=15000
BATCH_SIZE=4
GRAD_ACCUM=8
LR="5.0e-5"
WARMUP_STEPS=2000
WEIGHT_DECAY=0.05
PRETRAIN_MIX_RATIO=0.3
SEED=42

EXTRA_ARGS="$@"

# ---- B200 / NVSwitch NCCL tuning (same as pretrain) -------------------------
export NCCL_IB_DISABLE=1
export NCCL_ALGO=Ring
export NCCL_PROTO=Simple
export NCCL_MIN_NCHANNELS=16
export NCCL_MAX_NCHANNELS=16
export NCCL_BUFFSIZE=67108864
export OMP_NUM_THREADS=4
export MKL_NUM_THREADS=4

# 3B + bs=4 VRAM allocation
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

cd "$(dirname "$0")/.."

# ---- Pre-flight checks ------------------------------------------------------
if [[ ! -d "${BASE_CHECKPOINT}" ]]; then
    echo "=================================================================="
    echo " ERROR: Base checkpoint not found: ${BASE_CHECKPOINT}"
    echo " Set BASE_CHECKPOINT env var or use --base_checkpoint CLI arg."
    echo "=================================================================="
    exit 1
fi

if [[ ! -f "${SFT_DATA}" ]]; then
    echo "=================================================================="
    echo " ERROR: SFT data not found: ${SFT_DATA}"
    echo " Run: bash scripts/prepare_sft_combined.sh"
    echo "=================================================================="
    exit 1
fi

if [[ ! -f "${PRETRAIN_DATA}" ]]; then
    echo "=================================================================="
    echo " ERROR: Pretrain data not found: ${PRETRAIN_DATA}"
    echo " Set PRETRAIN_DATA env var to the correct path."
    echo "=================================================================="
    exit 1
fi

# val fallback
if [[ ! -f "${VAL_DATA}" ]]; then
    VAL_FALLBACK="data/sft_combined/val.jsonl"
    if [[ -f "${VAL_FALLBACK}" ]]; then
        VAL_DATA="${VAL_FALLBACK}"
        echo "[INFO] val_filtered not found, fallback: ${VAL_DATA}"
    else
        echo "ERROR: VAL_DATA not found: ${VAL_DATA}"
        exit 1
    fi
fi

mkdir -p "${CKPT_DIR}"

# Convert the mix ratio to an integer percentage for display.
# (The original $((100 - ${PRETRAIN_MIX_RATIO%.*}0)) arithmetic printed
# "100% SFT + 0.300% pretrain" for ratio 0.3.)
MIX_PCT=$(awk -v r="${PRETRAIN_MIX_RATIO}" 'BEGIN{printf "%.0f", r * 100}')

echo "=================================================================="
echo " 3B SFT v2 Fine-Tuning"
echo " Run name        : ${RUN_NAME}"
echo " Config          : ${CONFIG}"
echo " Base checkpoint : ${BASE_CHECKPOINT}"
echo " SFT data        : ${SFT_DATA}"
echo " Pretrain data   : ${PRETRAIN_DATA}"
echo " Val data        : ${VAL_DATA}"
echo " CKPT dir        : ${CKPT_DIR}"
echo " Log file        : ${LOG_FILE}"
echo " Max steps       : ${MAX_STEPS}"
echo " Batch size      : ${BATCH_SIZE} (local) x ${NPROC} GPU x ${GRAD_ACCUM} grad_accum = $((BATCH_SIZE * NPROC * GRAD_ACCUM)) eff_batch"
echo " Learning rate   : ${LR}"
echo " Weight decay    : ${WEIGHT_DECAY}"
echo " Warmup          : ${WARMUP_STEPS} steps"
echo " Data mixing     : $((100 - MIX_PCT))% SFT + ${MIX_PCT}% pretrain"
echo " Master port     : ${MASTER_PORT}"
echo " ALLOC_CONF      : ${PYTORCH_CUDA_ALLOC_CONF}"
echo " Started         : $(date)"
echo "=================================================================="

export PYTHONWARNINGS="ignore::UserWarning:torch.library"

torchrun \
  --nproc_per_node=${NPROC} \
  --master_port=${MASTER_PORT} \
  train/sft.py \
    --config "${CONFIG}" \
    --base_checkpoint "${BASE_CHECKPOINT}" \
    --sft_data "${SFT_DATA}" \
    --val_data "${VAL_DATA}" \
    --pretrain_data "${PRETRAIN_DATA}" \
    --pretrain_mix_ratio ${PRETRAIN_MIX_RATIO} \
    --checkpoint_dir "${CKPT_DIR}" \
    --log_file "${LOG_FILE}" \
    --max_steps ${MAX_STEPS} \
    --batch_size ${BATCH_SIZE} \
    --grad_accum ${GRAD_ACCUM} \
    --lr ${LR} \
    --weight_decay ${WEIGHT_DECAY} \
    --warmup_steps ${WARMUP_STEPS} \
    --seed ${SEED} \
    --use_fp8 \
    ${EXTRA_ARGS} \
  2>&1 | grep -v "UserWarning" \
    | grep -v "Warning only once" \
    | grep -v "Overriding a previously" \
    | grep -v "dispatch key:" \
    | grep -v "previous kernel:" \
    | grep -v "new kernel:" \
    | grep -v "operator: flash_attn" \
    | grep -v "registered at /usr/local" \
    | grep -v "self.m.impl" \
    | tee -a "${LOG_FILE}"

echo "=================================================================="
echo " 3B SFT v2 Done : $(date)"
echo "=================================================================="
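How `--pretrain_mix_ratio 0.3` is consumed lives in `train/sft.py`, which is not part of this upload; as a rough mental model only (a sketch, not the trainer's actual sampler), a 0.3 ratio means about 3 of every 10 batches come from the pretrain stream:

```bash
# Bernoulli(0.3) batch-source picker. $RANDOM is uniform over 0..32767,
# so comparing against ratio * 32768 approximates the mix probability.
PRETRAIN_MIX_RATIO=0.3
THRESH=$(awk -v r="${PRETRAIN_MIX_RATIO}" 'BEGIN{printf "%d", r * 32768}')
for step in $(seq 1 10); do
    if (( RANDOM < THRESH )); then
        echo "step ${step}: pretrain batch"
    else
        echo "step ${step}: SFT batch"
    fi
done
```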
source/scripts/launch_fp8.sh
ADDED
@@ -0,0 +1,94 @@
#!/usr/bin/env bash
# =============================================================================
# launch_fp8.sh — 8-GPU FP8 pretraining launcher for B200
#
# Usage:
#   bash scripts/launch_fp8.sh                    # full run
#   bash scripts/launch_fp8.sh --max_steps 500    # quick test
#   bash scripts/launch_fp8.sh --resume checkpoints/small_fp8_run1/checkpoint-0001000
#
# Config is read from configs/small_fp8.yaml (model) + CLI args (train).
# Logs: checkpoints/<RUN_NAME>/train.log
#       checkpoints/<RUN_NAME>/tensorboard/
# =============================================================================
set -euo pipefail

# ---- Configurable defaults --------------------------------------------------
RUN_NAME="${RUN_NAME:-small_fp8_run1}"
CONFIG="${CONFIG:-configs/small_fp8.yaml}"
TRAIN_DATA="${TRAIN_DATA:-data/train.bin}"
VAL_DATA="${VAL_DATA:-data/val.bin}"
CKPT_DIR="checkpoints/${RUN_NAME}"
LOG_FILE="${CKPT_DIR}/train.log"
NPROC=8
MASTER_PORT="${MASTER_PORT:-29500}"

# ---- Defaults that can be overridden via extra CLI args ----------------------
MAX_STEPS=100000
BATCH_SIZE=8
GRAD_ACCUM=4
WARMUP_STEPS=2000
SEED=42

# ---- Pass remaining CLI args directly to pretrain.py -------------------------
EXTRA_ARGS="$@"

# ---- B200 / NVSwitch single-node NCCL tuning ---------------------------------
# Single-node NVSwitch (NV18 full-mesh): disable IB to prevent NCCL probing.
export NCCL_IB_DISABLE=1
# Use Ring algorithm for large gradient tensors (128M-70B model range).
export NCCL_ALGO=Ring
# Simple protocol is optimal for NVLink bulk transfers (vs LL/LL128 for IB).
export NCCL_PROTO=Simple
# More channels → better NVSwitch saturation for large all-reduce payloads.
export NCCL_MIN_NCHANNELS=16
export NCCL_MAX_NCHANNELS=16
# Larger NCCL buffer (64 MB) reduces ring synchronisation overhead.
export NCCL_BUFFSIZE=67108864
# CPU thread limits (72 cores ÷ 8 ranks = 9; use 4 for DataLoader headroom).
export OMP_NUM_THREADS=4
export MKL_NUM_THREADS=4

# ---- Setup --------------------------------------------------------------------
cd "$(dirname "$0")/.."   # always run from project root (before mkdir so CKPT_DIR resolves there)
mkdir -p "${CKPT_DIR}"

echo "=================================================================="
echo " Run name : ${RUN_NAME}"
echo " Config   : ${CONFIG}"
echo " CKPT dir : ${CKPT_DIR}"
echo " Log file : ${LOG_FILE}"
echo " Started  : $(date)"
echo "=================================================================="

# Suppress the harmless flash_attn kernel override warning from all ranks.
export PYTHONWARNINGS="ignore::UserWarning:torch.library"

torchrun \
  --nproc_per_node=${NPROC} \
  --master_port=${MASTER_PORT} \
  train/pretrain.py \
    --config "${CONFIG}" \
    --train_data "${TRAIN_DATA}" \
    --val_data "${VAL_DATA}" \
    --checkpoint_dir "${CKPT_DIR}" \
    --log_file "${LOG_FILE}" \
    --max_steps ${MAX_STEPS} \
    --batch_size ${BATCH_SIZE} \
    --grad_accum ${GRAD_ACCUM} \
    --warmup_steps ${WARMUP_STEPS} \
    --seed ${SEED} \
    ${EXTRA_ARGS} \
  2>&1 | grep -v "UserWarning" \
    | grep -v "Warning only once" \
    | grep -v "Overriding a previously" \
    | grep -v "dispatch key:" \
    | grep -v "previous kernel:" \
    | grep -v "new kernel:" \
    | grep -v "operator: flash_attn" \
    | grep -v "registered at /usr/local" \
    | grep -v "self.m.impl"

echo "=================================================================="
echo " Done : $(date)"
echo "=================================================================="
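Note that, unlike launch_3b_pretrain.sh, this launcher neither wraps the grep chain in `{ ... || true; }` nor captures `PIPESTATUS[0]`, so under `set -euo pipefail` a failing torchrun aborts the script before the final banner. A two-line demonstration of the pipefail behavior (run without errexit):

```bash
set -o pipefail
bash -c 'echo ok; exit 3' | grep -v "noise"   # grep passes "ok" through and exits 0
echo "pipeline status: $?"                    # 3 with pipefail; 0 without it
```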
source/scripts/launch_hybrid_3b.sh
ADDED
@@ -0,0 +1,62 @@
#!/bin/bash
# ============================================================================
# FRANKENSTALLM-H 3B: Hybrid Mamba-2 + Transformer training launch script
# ============================================================================
#
# Usage:
#   nohup setsid bash scripts/launch_hybrid_3b.sh > logs/hybrid_3b.log 2>&1 &
#
# SIGHUP defense: the nohup + setsid combination keeps training alive even
# when the SSH connection drops.
# ============================================================================

set -euo pipefail

# ---- Environment variables ----
export OMP_NUM_THREADS=4
export NCCL_ALGO=NVLS          # optimal algorithm for NVSwitch
export NCCL_IB_DISABLE=1       # disable InfiniBand (single node)
export NCCL_P2P_LEVEL=NVL      # NVLink P2P
export NCCL_NET_GDR_LEVEL=0    # disable GPU Direct RDMA (single node)

# ---- Paths ----
PROJECT_ROOT="/PROJECT/0325120031_A/ghong/taketimes/llm-bang"
CONFIG="${PROJECT_ROOT}/configs/hybrid_3b.yaml"
TRAIN_DATA="${PROJECT_ROOT}/data/3b_train.bin"
VAL_DATA="${PROJECT_ROOT}/data/3b_val.bin"
CKPT_DIR="${PROJECT_ROOT}/checkpoints/hybrid_3b_run1"
LOG_FILE="${PROJECT_ROOT}/logs/hybrid_3b_train.log"

# ---- Create directories ----
mkdir -p "${CKPT_DIR}"
mkdir -p "$(dirname "${LOG_FILE}")"

cd "${PROJECT_ROOT}"

echo "============================================"
echo " FRANKENSTALLM-H 3B Hybrid Training"
echo " Config:     ${CONFIG}"
echo " Data:       ${TRAIN_DATA}"
echo " Checkpoint: ${CKPT_DIR}"
echo " Started:    $(date '+%Y-%m-%d %H:%M:%S')"
echo "============================================"

# ---- Run training (8-GPU DDP) ----
torchrun \
  --nproc_per_node=8 \
  --master_port=29500 \
  train/pretrain.py \
    --config "${CONFIG}" \
    --train_data "${TRAIN_DATA}" \
    --val_data "${VAL_DATA}" \
    --checkpoint_dir "${CKPT_DIR}" \
    --batch_size 4 \
    --lr 2e-4 \
    --weight_decay 0.1 \
    --warmup_steps 2000 \
    --grad_accum 8 \
    --max_steps 57000 \
    --log_file "${LOG_FILE}" \
    --use_fp8 \
    "$@"

echo "Training finished at $(date '+%Y-%m-%d %H:%M:%S')"
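One way to confirm the nohup + setsid launch really detached (an assumed verification step, not part of the script): the training process should be its own session leader, and once the launching shell exits its parent should be PID 1 or the user's systemd instance:

```bash
nohup setsid bash scripts/launch_hybrid_3b.sh > logs/hybrid_3b.log 2>&1 &
sleep 2
PID=$(pgrep -f "launch_hybrid_3b.sh" | head -1)
ps -o pid,ppid,sid,cmd -p "${PID}"   # expect SID == PID once the session detaches
```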
source/scripts/launch_korean_1b.sh
ADDED
@@ -0,0 +1,133 @@
#!/usr/bin/env bash
# =============================================================================
# launch_korean_1b.sh — 8-GPU FP8 pretraining launcher for 1B Korean LLM
#
# Usage:
#   bash scripts/launch_korean_1b.sh                    # full run
#   bash scripts/launch_korean_1b.sh --max_steps 500    # quick test
#   bash scripts/launch_korean_1b.sh --resume checkpoints/korean_1b_fp8_run1/checkpoint-0010000
#
# Config is read from configs/korean_1b_fp8.yaml (model) + CLI args (train).
# Effective batch size: 8 (local) × 8 GPU × 4 (grad_accum) × 4096 (seq_len)
#                     = 1,048,576 tokens / step
# Logs: checkpoints/<RUN_NAME>/train.log
#       checkpoints/<RUN_NAME>/tensorboard/
# =============================================================================
set -euo pipefail

# ---- Configurable defaults --------------------------------------------------
RUN_NAME="${RUN_NAME:-korean_1b_fp8_run1}"
CONFIG="${CONFIG:-configs/korean_1b_fp8.yaml}"
TRAIN_DATA="${TRAIN_DATA:-data/korean_train.bin}"
VAL_DATA="${VAL_DATA:-data/korean_val.bin}"
CKPT_DIR="checkpoints/${RUN_NAME}"
LOG_FILE="${CKPT_DIR}/train.log"
NPROC=8
MASTER_PORT="${MASTER_PORT:-29501}"

# ---- Defaults that can be overridden via extra CLI args ----------------------
MAX_STEPS=34000     # 4 epochs × 8.91B tokens = 35.6B (Muennighoff 2023: val loss rises past 4 epochs)
BATCH_SIZE=8
GRAD_ACCUM=4
WARMUP_STEPS=2000   # 5.9% of 34k steps (the earlier 4000 = 11.8% was excessive)
SEED=42

# ---- Pass remaining CLI args directly to pretrain.py -------------------------
EXTRA_ARGS="$@"

# ---- B200 / NVSwitch single-node NCCL tuning ---------------------------------
# Single-node NVSwitch (NV18 full-mesh): disable IB to prevent NCCL probing.
export NCCL_IB_DISABLE=1
# Use Ring algorithm for large gradient tensors (128M-70B model range).
export NCCL_ALGO=Ring
# Simple protocol is optimal for NVLink bulk transfers (vs LL/LL128 for IB).
export NCCL_PROTO=Simple
# More channels → better NVSwitch saturation for large all-reduce payloads.
export NCCL_MIN_NCHANNELS=16
export NCCL_MAX_NCHANNELS=16
# Larger NCCL buffer (64 MB) reduces ring synchronisation overhead.
export NCCL_BUFFSIZE=67108864
# CPU thread limits (72 cores ÷ 8 ranks = 9; use 4 for DataLoader headroom).
export OMP_NUM_THREADS=4
export MKL_NUM_THREADS=4

# ---- Setup --------------------------------------------------------------------
cd "$(dirname "$0")/.."   # always run from project root

# ---- Pre-flight check: Korean data must exist before launching ----------------
if [[ ! -f "${TRAIN_DATA}" ]]; then
    echo "=================================================================="
    echo " ERROR: Training data not found: ${TRAIN_DATA}"
    echo ""
    echo " You need to run the Korean data pipeline first."
    echo " Example steps:"
    echo "   1. Download / prepare raw Korean corpus"
    echo "   2. Tokenise and pack into binary format:"
    echo "      python data/prepare_korean.py --output data/korean_train.bin"
    echo "   3. Re-run this script once the file exists."
    echo "=================================================================="
    exit 1
fi

if [[ ! -f "${VAL_DATA}" ]]; then
    echo "=================================================================="
    echo " ERROR: Validation data not found: ${VAL_DATA}"
    echo ""
    echo " You need to run the Korean data pipeline first."
    echo " Example steps:"
    echo "   1. Download / prepare raw Korean corpus"
    echo "   2. Tokenise and pack into binary format (val split):"
    echo "      python data/prepare_korean.py --output_val data/korean_val.bin"
    echo "   3. Re-run this script once the file exists."
    echo "=================================================================="
    exit 1
fi

mkdir -p "${CKPT_DIR}"

echo "=================================================================="
echo " Run name    : ${RUN_NAME}"
echo " Config      : ${CONFIG}"
echo " Train data  : ${TRAIN_DATA}"
echo " Val data    : ${VAL_DATA}"
echo " CKPT dir    : ${CKPT_DIR}"
echo " Log file    : ${LOG_FILE}"
echo " Max steps   : ${MAX_STEPS}"
echo " Batch size  : ${BATCH_SIZE} (local) × ${NPROC} GPU × ${GRAD_ACCUM} grad_accum"
echo " Warmup      : ${WARMUP_STEPS} steps"
echo " Master port : ${MASTER_PORT}"
echo " Started     : $(date)"
echo "=================================================================="

# Suppress the harmless flash_attn kernel override warning from all ranks.
export PYTHONWARNINGS="ignore::UserWarning:torch.library"

torchrun \
  --nproc_per_node=${NPROC} \
  --master_port=${MASTER_PORT} \
  train/pretrain.py \
    --config "${CONFIG}" \
    --train_data "${TRAIN_DATA}" \
    --val_data "${VAL_DATA}" \
    --checkpoint_dir "${CKPT_DIR}" \
    --log_file "${LOG_FILE}" \
    --max_steps ${MAX_STEPS} \
    --batch_size ${BATCH_SIZE} \
    --grad_accum ${GRAD_ACCUM} \
    --warmup_steps ${WARMUP_STEPS} \
    --seed ${SEED} \
    ${EXTRA_ARGS} \
  2>&1 | grep -v "UserWarning" \
    | grep -v "Warning only once" \
    | grep -v "Overriding a previously" \
    | grep -v "dispatch key:" \
    | grep -v "previous kernel:" \
    | grep -v "new kernel:" \
    | grep -v "operator: flash_attn" \
    | grep -v "registered at /usr/local" \
    | grep -v "self.m.impl" \
    | tee -a "${LOG_FILE}"

echo "=================================================================="
echo " Done : $(date)"
echo "=================================================================="
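The MAX_STEPS=34000 choice follows from the token budget in the header comment; a back-of-envelope check:

```bash
TOKENS_PER_STEP=$((8 * 8 * 4 * 4096))   # batch x GPUs x grad_accum x seq_len = 1,048,576
TOTAL=$((34000 * TOKENS_PER_STEP))
echo "total: ${TOTAL} tokens (~$((TOTAL / 1000000000))B)"   # ~35B, i.e. 4 epochs x 8.91B
```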
source/scripts/launch_korean_3b.sh
ADDED
@@ -0,0 +1,115 @@
#!/usr/bin/env bash
# =============================================================================
# launch_korean_3b.sh — 8-GPU FP8 pretraining launcher for 3B Korean LLM
#
# Usage:
#   bash scripts/launch_korean_3b.sh                   # full run (~60B tokens)
#   bash scripts/launch_korean_3b.sh --max_steps 50    # quick benchmark
#   bash scripts/launch_korean_3b.sh --resume checkpoints/korean_3b_fp8_run1/checkpoint-XXXXX
#
# Effective batch size: 4 (local) × 8 GPU × 8 (grad_accum) × 4096 (seq_len)
#                     = 1,048,576 tokens / step
# =============================================================================
set -euo pipefail

RUN_NAME="${RUN_NAME:-korean_3b_fp8_run1}"
CONFIG="${CONFIG:-configs/3b_pretrain.yaml}"
CKPT_DIR="checkpoints/${RUN_NAME}"
LOG_FILE="${CKPT_DIR}/train.log"
NPROC=8
MASTER_PORT="${MASTER_PORT:-29502}"

MAX_STEPS=57000
BATCH_SIZE=4
GRAD_ACCUM=8
LR=1.5e-4
WARMUP_STEPS=2000
SEED=42

EXTRA_ARGS="$@"

# ---- B200 / NVSwitch NCCL tuning ---------------------------------------------
export NCCL_IB_DISABLE=1
export NCCL_ALGO=Ring
export NCCL_PROTO=Simple
export NCCL_MIN_NCHANNELS=16
export NCCL_MAX_NCHANNELS=16
export NCCL_BUFFSIZE=67108864
export OMP_NUM_THREADS=4
export MKL_NUM_THREADS=4

# cd FIRST so the relative-path checks below resolve from the project root
cd "$(dirname "$0")/.."

# TRAIN_DATA fallback: relative paths checked after the cd above
if [[ -f "data/merged_3b_train.bin" ]]; then
    TRAIN_DATA="${TRAIN_DATA:-data/merged_3b_train.bin}"
    echo "Using merged training data: data/merged_3b_train.bin"
elif [[ -f "data/korean_train.bin" ]]; then
    TRAIN_DATA="${TRAIN_DATA:-data/korean_train.bin}"
    echo "Using fallback training data: data/korean_train.bin"
else
    echo "ERROR: No training data found (data/merged_3b_train.bin or data/korean_train.bin)"
    exit 1
fi

# VAL_DATA fallback: relative paths checked after the cd above
VAL_DATA="${VAL_DATA:-data/merged_3b_val.bin}"
if [[ ! -f "${VAL_DATA}" ]]; then
    VAL_DATA="data/korean_val.bin"
fi

if [[ ! -f "${TRAIN_DATA}" ]]; then
    echo "ERROR: Training data not found: ${TRAIN_DATA}"
    exit 1
fi
if [[ ! -f "${VAL_DATA}" ]]; then
    echo "ERROR: Validation data not found: ${VAL_DATA}"
    exit 1
fi

mkdir -p "${CKPT_DIR}"

echo "=================================================================="
echo " Run name   : ${RUN_NAME}"
echo " Config     : ${CONFIG}"
echo " Train data : ${TRAIN_DATA}"
echo " CKPT dir   : ${CKPT_DIR}"
echo " Max steps  : ${MAX_STEPS}"
echo " LR         : ${LR}"
echo " Batch size : ${BATCH_SIZE} (local) × ${NPROC} GPU × ${GRAD_ACCUM} grad_accum"
echo " Started    : $(date)"
echo "=================================================================="

export PYTHONWARNINGS="ignore::UserWarning:torch.library"

torchrun \
  --nproc_per_node=${NPROC} \
  --master_port=${MASTER_PORT} \
  train/pretrain.py \
    --config "${CONFIG}" \
    --train_data "${TRAIN_DATA}" \
    --val_data "${VAL_DATA}" \
    --checkpoint_dir "${CKPT_DIR}" \
    --log_file "${LOG_FILE}" \
    --max_steps ${MAX_STEPS} \
    --batch_size ${BATCH_SIZE} \
    --lr ${LR} \
    --grad_accum ${GRAD_ACCUM} \
    --warmup_steps ${WARMUP_STEPS} \
    --seed ${SEED} \
    ${EXTRA_ARGS} \
  2>&1 | grep -v "UserWarning" \
    | grep -v "Warning only once" \
    | grep -v "Overriding a previously" \
    | grep -v "dispatch key:" \
    | grep -v "previous kernel:" \
    | grep -v "new kernel:" \
    | grep -v "operator: flash_attn" \
    | grep -v "registered at /usr/local" \
    | grep -v "self.m.impl" \
    | tee -a "${LOG_FILE}"

echo "=================================================================="
echo " Done : $(date)"
echo "=================================================================="
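The train-data selection above is a prefer-then-fall-back chain. The same idea distilled into a reusable helper (a sketch; the script deliberately inlines it so each branch can print its own message):

```bash
# Return the first existing candidate, or fail.
pick_data() {
    local f
    for f in data/merged_3b_train.bin data/korean_train.bin; do
        [[ -f "$f" ]] && { echo "$f"; return 0; }
    done
    return 1
}
TRAIN_DATA=$(pick_data) || { echo "ERROR: no training data found"; exit 1; }
```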
source/scripts/launch_sft.sh
ADDED
@@ -0,0 +1,111 @@
#!/usr/bin/env bash
# =============================================================================
# launch_sft.sh — 8-GPU FP8 SFT launcher for 1B Korean LLM
#
# Usage:
#   bash scripts/launch_sft.sh
#   bash scripts/launch_sft.sh --max_steps 500   # quick test
#   bash scripts/launch_sft.sh --resume checkpoints/korean_1b_sft/checkpoint-0001000
#
# Base model: checkpoints/korean_1b_fp8_run1/checkpoint-0034000
# SFT data:   data/sft/train.jsonl
# =============================================================================
set -euo pipefail

# ---- Configurable defaults --------------------------------------------------
RUN_NAME="${RUN_NAME:-korean_1b_sft}"
BASE_CHECKPOINT="${BASE_CHECKPOINT:-checkpoints/korean_1b_fp8_run1/checkpoint-0034000}"
SFT_DATA="${SFT_DATA:-data/sft/train.jsonl}"
VAL_DATA="${VAL_DATA:-data/sft/val.jsonl}"
CKPT_DIR="checkpoints/${RUN_NAME}"
LOG_FILE="${CKPT_DIR}/train.log"
NPROC=8
MASTER_PORT="${MASTER_PORT:-29502}"

MAX_STEPS=9000
BATCH_SIZE=4
GRAD_ACCUM=2
LR="2.0e-5"
WARMUP_STEPS=300
SEED=42

EXTRA_ARGS="$@"

# ---- B200 / NVSwitch NCCL tuning (same as pretrain) -------------------------
export NCCL_IB_DISABLE=1
export NCCL_ALGO=Ring
export NCCL_PROTO=Simple
export NCCL_MIN_NCHANNELS=16
export NCCL_MAX_NCHANNELS=16
export NCCL_BUFFSIZE=67108864
export OMP_NUM_THREADS=4
export MKL_NUM_THREADS=4

cd "$(dirname "$0")/.."

# ---- Pre-flight checks ------------------------------------------------------
if [[ ! -d "${BASE_CHECKPOINT}" ]]; then
    echo "ERROR: Base checkpoint not found: ${BASE_CHECKPOINT}"
    exit 1
fi

if [[ ! -f "${SFT_DATA}" ]]; then
    echo "=================================================================="
    echo " ERROR: SFT training data not found: ${SFT_DATA}"
    echo ""
    echo " Run the data preparation script first:"
    echo "   python data/prepare_sft_data.py"
    echo "=================================================================="
    exit 1
fi

mkdir -p "${CKPT_DIR}"

echo "=================================================================="
echo " SFT Fine-Tuning"
echo " Run name        : ${RUN_NAME}"
echo " Base checkpoint : ${BASE_CHECKPOINT}"
echo " SFT data        : ${SFT_DATA}"
echo " CKPT dir        : ${CKPT_DIR}"
echo " Log file        : ${LOG_FILE}"
echo " Max steps       : ${MAX_STEPS}"
echo " Batch size      : ${BATCH_SIZE} (local) × ${NPROC} GPU × ${GRAD_ACCUM} grad_accum"
echo " Learning rate   : ${LR}"
echo " Warmup          : ${WARMUP_STEPS} steps"
echo " Master port     : ${MASTER_PORT}"
echo " Started         : $(date)"
echo "=================================================================="

export PYTHONWARNINGS="ignore::UserWarning:torch.library"

torchrun \
  --nproc_per_node=${NPROC} \
  --master_port=${MASTER_PORT} \
  train/sft.py \
    --base_checkpoint "${BASE_CHECKPOINT}" \
    --sft_data "${SFT_DATA}" \
    --checkpoint_dir "${CKPT_DIR}" \
    --log_file "${LOG_FILE}" \
    --max_steps ${MAX_STEPS} \
    --batch_size ${BATCH_SIZE} \
    --grad_accum ${GRAD_ACCUM} \
    --lr ${LR} \
    --warmup_steps ${WARMUP_STEPS} \
    --seed ${SEED} \
    --use_fp8 \
    --val_data "${VAL_DATA}" \
    ${EXTRA_ARGS} \
  2>&1 | grep -v "UserWarning" \
    | grep -v "Warning only once" \
    | grep -v "Overriding a previously" \
    | grep -v "dispatch key:" \
    | grep -v "previous kernel:" \
    | grep -v "new kernel:" \
    | grep -v "operator: flash_attn" \
    | grep -v "registered at /usr/local" \
    | grep -v "self.m.impl" \
    | tee -a "${LOG_FILE}"

echo "=================================================================="
echo " SFT Done : $(date)"
echo "=================================================================="
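Because the launcher tees all filtered output into `${CKPT_DIR}/train.log`, a couple of follow-along commands are useful while it runs (the grep patterns are assumptions about `train/sft.py`'s log format; adjust to match):

```bash
# Stream only loss lines as they are appended:
tail -f checkpoints/korean_1b_sft/train.log | grep --line-buffered "loss"
# Rough progress: count logged steps so far (pattern assumed):
grep -c "step" checkpoints/korean_1b_sft/train.log
```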
source/scripts/migrate_qkv_checkpoint.py
ADDED
|
@@ -0,0 +1,230 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""Migrate checkpoint from separate Q/K/V projections to fused QKV.
|
| 3 |
+
|
| 4 |
+
Usage:
|
| 5 |
+
python3 scripts/migrate_qkv_checkpoint.py <checkpoint_dir>
|
| 6 |
+
|
| 7 |
+
Migrates both model.pt AND optimizer.pt:
|
| 8 |
+
- model.pt: q_proj/k_proj/v_proj weights → qkv_proj weight
|
| 9 |
+
- optimizer.pt: exp_avg/exp_avg_sq states fused, param indices re-mapped
|
| 10 |
+
|
| 11 |
+
The concatenation order is [Q ; K ; V] along the output (dim-0) axis,
|
| 12 |
+
which matches the split in MultiHeadAttention.forward:
|
| 13 |
+
q, k, v = qkv.split([_q_dim, _kv_dim, _kv_dim], dim=-1)
|
| 14 |
+
|
| 15 |
+
Optimizer layout (group 0 = weight_decay, per layer × 28):
|
| 16 |
+
[i*6+0] q_proj.weight [3072, 3072]
|
| 17 |
+
[i*6+1] k_proj.weight [1024, 3072]
|
| 18 |
+
[i*6+2] v_proj.weight [1024, 3072]
|
| 19 |
+
[i*6+3] out_proj.weight [3072, 3072]
|
| 20 |
+
[i*6+4] fc1_weight [16384, 3072]
|
| 21 |
+
[i*6+5] fc2_weight [3072, 8192]
|
| 22 |
+
After fusion: indices 0,1,2 → single qkv_proj → 4 params per layer.
|
| 23 |
+
"""
|
| 24 |
+
import sys
|
| 25 |
+
import torch
|
| 26 |
+
from pathlib import Path
|
| 27 |
+
|
| 28 |
+
N_LAYERS = 28
|
| 29 |
+
OLD_PARAMS_PER_LAYER = 6 # q, k, v, out, fc1, fc2
|
| 30 |
+
NEW_PARAMS_PER_LAYER = 4 # qkv, out, fc1, fc2
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def migrate_model(state: dict) -> dict:
|
| 34 |
+
"""Fuse Q/K/V projection weights into QKV in model state dict."""
|
| 35 |
+
new_state: dict = {}
|
| 36 |
+
layers_done: set = set()
|
| 37 |
+
|
| 38 |
+
for key, val in state.items():
|
| 39 |
+
if ".q_proj." not in key and ".k_proj." not in key and ".v_proj." not in key:
|
| 40 |
+
new_state[key] = val
|
| 41 |
+
continue
|
| 42 |
+
|
| 43 |
+
if ".q_proj." not in key:
|
| 44 |
+
continue
|
| 45 |
+
|
| 46 |
+
prefix = key.rsplit(".", 2)[0]
|
| 47 |
+
suffix = key.rsplit(".", 1)[-1]
|
| 48 |
+
|
| 49 |
+
tag = (prefix, suffix)
|
| 50 |
+
if tag in layers_done:
|
| 51 |
+
continue
|
| 52 |
+
layers_done.add(tag)
|
| 53 |
+
|
| 54 |
+
q_key = f"{prefix}.q_proj.{suffix}"
|
| 55 |
+
k_key = f"{prefix}.k_proj.{suffix}"
|
| 56 |
+
v_key = f"{prefix}.v_proj.{suffix}"
|
| 57 |
+
|
| 58 |
+
missing = [k for k in (q_key, k_key, v_key) if k not in state]
|
| 59 |
+
if missing:
|
| 60 |
+
raise KeyError(f"Expected keys not found in checkpoint: {missing}")
|
| 61 |
+
|
| 62 |
+
q_w, k_w, v_w = state[q_key], state[k_key], state[v_key]
|
| 63 |
+
fused = torch.cat([q_w, k_w, v_w], dim=0)
|
| 64 |
+
fused_key = f"{prefix}.qkv_proj.{suffix}"
|
| 65 |
+
new_state[fused_key] = fused
|
| 66 |
+
print(f" Fused {fused_key}: {list(fused.shape)}"
|
| 67 |
+
f" (q={list(q_w.shape)}, k={list(k_w.shape)}, v={list(v_w.shape)})")
|
| 68 |
+
|
| 69 |
+
leaked = [k for k in new_state if ".q_proj." in k or ".k_proj." in k or ".v_proj." in k]
|
| 70 |
+
if leaked:
|
| 71 |
+
raise RuntimeError(f"BUG: old projection keys still present: {leaked}")
|
| 72 |
+
|
| 73 |
+
return new_state
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def migrate_optimizer(opt_state: dict) -> dict:
|
| 77 |
+
"""Fuse optimizer states for Q/K/V → QKV and re-index parameters.
|
| 78 |
+
|
| 79 |
+
The optimizer has 2 param groups:
|
| 80 |
+
Group 0 (weight_decay): 168 = 28 layers × 6 (q,k,v,out,fc1,fc2)
|
| 81 |
+
Group 1 (no weight_decay): 58 = norms + embedding
|
| 82 |
+
|
| 83 |
+
We fuse q,k,v entries in group 0 (indices i*6+0,1,2 → one entry per layer).
|
| 84 |
+
Group 0 shrinks from 168 to 112 (28 layers × 4 params).
|
| 85 |
+
Group 1 stays at 58. Total: 170.
|
| 86 |
+
"""
|
| 87 |
+
old_state = opt_state["state"]
|
| 88 |
+
old_groups = opt_state["param_groups"]
|
| 89 |
+
|
| 90 |
+
group0_count = len(old_groups[0]["params"])
|
| 91 |
+
expected_g0 = N_LAYERS * OLD_PARAMS_PER_LAYER
|
| 92 |
+
if group0_count != expected_g0:
|
| 93 |
+
raise ValueError(
|
| 94 |
+
f"Group 0 has {group0_count} params, expected {expected_g0}. "
|
| 95 |
+
f"Cannot auto-detect QKV layout."
|
| 96 |
+
)
|
| 97 |
+
|
| 98 |
+
# Validate shapes for first layer
|
| 99 |
+
shapes = []
|
| 100 |
+
for j in range(OLD_PARAMS_PER_LAYER):
|
| 101 |
+
idx = old_groups[0]["params"][j]
|
| 102 |
+
shapes.append(list(old_state[idx]["exp_avg"].shape))
|
| 103 |
+
expected_shapes = [[3072, 3072], [1024, 3072], [1024, 3072],
|
| 104 |
+
[3072, 3072], [16384, 3072], [3072, 8192]]
|
| 105 |
+
if shapes != expected_shapes:
|
| 106 |
+
raise ValueError(
|
| 107 |
+
f"Layer 0 shapes {shapes} don't match expected {expected_shapes}. "
|
| 108 |
+
f"Cannot auto-detect QKV layout."
|
| 109 |
+
)
|
| 110 |
+
print(f" Shape validation passed for layer 0.")
|
| 111 |
+
|
| 112 |
+
new_state_entries = {}
|
| 113 |
+
new_idx = 0
|
| 114 |
+
|
| 115 |
+
# --- Group 0: fuse q/k/v per layer ---
|
| 116 |
+
for layer_i in range(N_LAYERS):
|
| 117 |
+
base = layer_i * OLD_PARAMS_PER_LAYER
|
| 118 |
+
q_opt_idx = old_groups[0]["params"][base + 0]
|
| 119 |
+
k_opt_idx = old_groups[0]["params"][base + 1]
|
| 120 |
+
v_opt_idx = old_groups[0]["params"][base + 2]
|
| 121 |
+
|
| 122 |
+
q_entry = old_state[q_opt_idx]
|
| 123 |
+
k_entry = old_state[k_opt_idx]
|
| 124 |
+
v_entry = old_state[v_opt_idx]
|
| 125 |
+
|
| 126 |
+
# Fuse QKV
|
| 127 |
+
fused_entry = {"step": q_entry["step"]}
|
| 128 |
+
for field in ["exp_avg", "exp_avg_sq"]:
|
| 129 |
+
if field in q_entry:
|
| 130 |
+
fused_entry[field] = torch.cat(
|
| 131 |
+
[q_entry[field], k_entry[field], v_entry[field]], dim=0
|
| 132 |
+
)
|
| 133 |
+
new_state_entries[new_idx] = fused_entry
|
| 134 |
+
if layer_i == 0:
|
| 135 |
+
print(f" Layer 0 QKV fused: exp_avg {list(fused_entry['exp_avg'].shape)}")
|
| 136 |
+
new_idx += 1
|
| 137 |
+
|
| 138 |
+
# Copy remaining params (out, fc1, fc2)
|
| 139 |
+
for offset in [3, 4, 5]:
|
| 140 |
+
opt_idx = old_groups[0]["params"][base + offset]
|
| 141 |
+
new_state_entries[new_idx] = old_state[opt_idx]
|
| 142 |
+
new_idx += 1
|
| 143 |
    new_group0_count = new_idx  # should be N_LAYERS * NEW_PARAMS_PER_LAYER = 112
    print(f"  Group 0: {group0_count} → {new_group0_count} params")

    # --- Group 1: copy as-is (norms, embedding — no QKV) ---
    group1_count = len(old_groups[1]["params"])
    for j in range(group1_count):
        opt_idx = old_groups[1]["params"][j]
        if opt_idx in old_state:
            new_state_entries[new_idx] = old_state[opt_idx]
        new_idx += 1
    print(f"  Group 1: {group1_count} → {group1_count} params (unchanged)")

    # Build new param_groups
    new_groups = []
    g0 = {k: v for k, v in old_groups[0].items() if k != "params"}
    g0["params"] = list(range(0, new_group0_count))
    new_groups.append(g0)

    g1 = {k: v for k, v in old_groups[1].items() if k != "params"}
    g1["params"] = list(range(new_group0_count, new_group0_count + group1_count))
    new_groups.append(g1)

    total = new_group0_count + group1_count
    print(f"  Total: {len(old_state)} → {total} optimizer params")

    return {"state": new_state_entries, "param_groups": new_groups}


def migrate(ckpt_dir: Path) -> None:
    model_path = ckpt_dir / "model.pt"
    opt_path = ckpt_dir / "optimizer.pt"

    if not model_path.exists():
        raise FileNotFoundError(f"model.pt not found in {ckpt_dir}")

    # --- Model migration ---
    print(f"[1/2] Migrating model weights from {model_path} ...")
    state = torch.load(model_path, map_location="cpu", weights_only=True)

    has_old = any(".q_proj." in k for k in state)
    has_new = any(".qkv_proj." in k for k in state)

    if has_new and not has_old:
        print("  Model already migrated. Skipping.")
    elif has_old:
        new_model_state = migrate_model(state)
        torch.save(new_model_state, model_path)
        print("  Model saved.")
    else:
        raise RuntimeError("Model state has neither q_proj nor qkv_proj keys!")

    # --- Optimizer migration ---
    if opt_path.exists():
        print(f"\n[2/2] Migrating optimizer states from {opt_path} ...")
        opt = torch.load(opt_path, map_location="cpu", weights_only=True)

        # Check if already migrated
        total_params = sum(len(pg["params"]) for pg in opt["param_groups"])
        expected_old = N_LAYERS * OLD_PARAMS_PER_LAYER + 58  # 168 + 58 = 226
        expected_new = N_LAYERS * NEW_PARAMS_PER_LAYER + 58  # 112 + 58 = 170

        if total_params == expected_old:
            opt_backup = ckpt_dir / "optimizer.pt.backup_pre_qkv"
            if not opt_backup.exists():
                torch.save(opt, opt_backup)
                print(f"  Backup: {opt_backup}")
            new_opt = migrate_optimizer(opt)
            torch.save(new_opt, opt_path)
            print("  Optimizer saved.")
        elif total_params == expected_new:
            print(f"  Optimizer already migrated ({total_params} params). Skipping.")
        else:
            print(f"  [WARN] Unexpected param count {total_params} "
                  f"(expected old={expected_old} or new={expected_new}). "
                  f"Deleting optimizer.pt — optimizer will restart fresh.")
            opt_path.unlink()
    else:
        print("\n[2/2] No optimizer.pt found. Optimizer will restart fresh.")

    print("\nMigration complete!")


if __name__ == "__main__":
    if len(sys.argv) != 2:
        print(__doc__)
        sys.exit(1)
    migrate(Path(sys.argv[1]))
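Before relaunching training it is worth confirming the migration actually took. A minimal sketch of such a check, assuming the checkpoint layout used by this script (the `checkpoint-XXXXX` placeholder must be filled in; the expected count of 170 is the 112 fused params plus 58 unchanged ones, as computed above):

```bash
# Hypothetical post-migration sanity check (run from the repo root).
python3 - <<'PY'
import torch
ckpt = "checkpoints/korean_3b_fp8_run1/checkpoint-XXXXX"  # fill in the step
state = torch.load(f"{ckpt}/model.pt", map_location="cpu", weights_only=True)
# After migration: qkv_proj keys present, q_proj keys gone.
print("qkv_proj keys:", sum(".qkv_proj." in k for k in state))
print("q_proj keys  :", sum(".q_proj." in k for k in state))
opt = torch.load(f"{ckpt}/optimizer.pt", map_location="cpu", weights_only=True)
# Expect 170 after migration (112 fused attention params + 58 others).
print("optimizer params:", sum(len(g["params"]) for g in opt["param_groups"]))
PY
```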
source/scripts/monitor_3b.sh
ADDED
@@ -0,0 +1,316 @@
#!/usr/bin/env bash
# =============================================================================
# monitor_3b.sh — real-time 3B training monitor + anomaly detection
#                 + automatic checkpoint cleanup
#
# Usage:
#   bash scripts/monitor_3b.sh                  # default watch loop
#   bash scripts/monitor_3b.sh --check-once     # run checks once and exit
#   bash scripts/monitor_3b.sh --auto-cleanup   # auto-delete old checkpoints
#
# 3B-specific details:
#   - checkpoints are ~27 GB each → stricter disk monitoring
#   - NCCL hang detection + optional auto-restart
#   - live ETA estimation
#   - guards against duplicate monitor processes
# =============================================================================
set -euo pipefail

# ---- Configuration ----------------------------------------------------------
RUN_NAME="${RUN_NAME:-korean_3b_fp8_run1}"
LOG_FILE="${1:-checkpoints/${RUN_NAME}/train.log}"
CKPT_DIR="checkpoints/${RUN_NAME}"
CHECK_INTERVAL=60            # 3B steps are slower → poll every 60 s
ZERO_LOSS_THRESHOLD=3
GNORM_WARN=10.0
GNORM_CRITICAL=50.0
LOSS_SPIKE_FACTOR=3.0
STALL_TIMEOUT=600            # 10 min (3B steps take longer)
DISK_WARN_PCT=85
DISK_CRITICAL_PCT=92
GPU_UTIL_WARN=50
MAX_CHECKPOINTS=15           # max retained checkpoints (15 × 27 GB = 405 GB)
CHECK_ONCE=false
AUTO_CLEANUP=false
AUTO_RESTART=false

# Parse args
for arg in "$@"; do
  case "$arg" in
    --check-once)   CHECK_ONCE=true ;;
    --auto-cleanup) AUTO_CLEANUP=true ;;
    --auto-restart) AUTO_RESTART=true ;;
  esac
done
# Fix LOG_FILE if first arg was a flag
if [[ "$LOG_FILE" == --* ]]; then
  LOG_FILE="checkpoints/${RUN_NAME}/train.log"
fi

# ---- Colors -----------------------------------------------------------------
RED='\033[0;31m'; YELLOW='\033[1;33m'; GREEN='\033[0;32m'
CYAN='\033[0;36m'; MAGENTA='\033[0;35m'; NC='\033[0m'

timestamp() { date '+%Y-%m-%d %H:%M:%S'; }

alert() {
  local level="$1" msg="$2"
  case "$level" in
    CRITICAL) echo -e "${RED}🔴 [$(timestamp)] [CRITICAL] ${msg}${NC}" ;;
    WARNING)  echo -e "${YELLOW}🟠 [$(timestamp)] [WARNING] ${msg}${NC}" ;;
    INFO)     echo -e "${CYAN}🟡 [$(timestamp)] [INFO] ${msg}${NC}" ;;
    OK)       echo -e "${GREEN}✅ [$(timestamp)] [OK] ${msg}${NC}" ;;
  esac
}

# ---- Parse metrics ----------------------------------------------------------
parse_metrics() {
  local n="${1:-20}"
  [[ -f "$LOG_FILE" ]] || return
  tail -n "$n" "$LOG_FILE" | grep "step.*loss.*gnorm" || true
}

extract_field() {
  echo "$1" | grep -oP "${2}\s+\K[0-9]+\.[0-9e+\-]+" | head -1
}

extract_step() {
  echo "$1" | grep -oP "step\s+\K[0-9]+" | head -1
}

# ---- Check: Loss = 0 -------------------------------------------------------
check_loss_zero() {
  local lines
  lines=$(parse_metrics "$ZERO_LOSS_THRESHOLD")
  [[ -z "$lines" ]] && return 0
  local zero_count=0
  while IFS= read -r line; do
    local loss=$(extract_field "$line" "loss")
    if [[ -n "$loss" ]] && (( $(echo "$loss < 0.001" | bc -l 2>/dev/null || echo 0) )); then
      ((zero_count++))
    fi
  done <<< "$lines"
  if [[ $zero_count -ge $ZERO_LOSS_THRESHOLD ]]; then
    alert CRITICAL "Loss ~0 for ${zero_count} consecutive steps! Labels bug. Stop immediately!"
    return 1
  fi
}

# ---- Check: Loss spike -----------------------------------------------------
check_loss_spike() {
  local lines=$(parse_metrics 20)
  [[ -z "$lines" ]] && return 0
  local losses=()
  while IFS= read -r line; do
    local loss=$(extract_field "$line" "loss")
    [[ -n "$loss" ]] && losses+=("$loss")
  done <<< "$lines"
  local count=${#losses[@]}
  [[ $count -lt 5 ]] && return 0
  local last="${losses[$((count-1))]}"
  local sum=0
  for ((i=0; i<count-1; i++)); do
    sum=$(echo "$sum + ${losses[$i]}" | bc -l 2>/dev/null || echo "$sum")
  done
  local avg=$(echo "$sum / ($count - 1)" | bc -l 2>/dev/null || echo "0")
  if [[ "$avg" != "0" ]]; then
    local ratio=$(echo "$last / $avg" | bc -l 2>/dev/null || echo "1")
    if (( $(echo "$ratio > $LOSS_SPIKE_FACTOR" | bc -l 2>/dev/null || echo 0) )); then
      alert WARNING "Loss spike! current=${last}, mean=${avg}, ratio=${ratio}x"
    fi
  fi
}

# ---- Check: Gradient norm ---------------------------------------------------
check_gnorm() {
  local lines=$(parse_metrics 5)
  [[ -z "$lines" ]] && return 0
  local gnorm=$(extract_field "$(echo "$lines" | tail -1)" "gnorm")
  [[ -z "$gnorm" ]] && return 0
  if (( $(echo "$gnorm > $GNORM_CRITICAL" | bc -l 2>/dev/null || echo 0) )); then
    alert CRITICAL "GNorm=${gnorm} > ${GNORM_CRITICAL}! On the verge of divergence."
  elif (( $(echo "$gnorm > $GNORM_WARN" | bc -l 2>/dev/null || echo 0) )); then
    alert WARNING "GNorm=${gnorm} unstable."
  fi
}

# ---- Check: Stall / NCCL hang ----------------------------------------------
check_stall() {
  [[ ! -f "$LOG_FILE" ]] && return 0
  local last_mod=$(stat -c %Y "$LOG_FILE" 2>/dev/null || echo 0)
  local now=$(date +%s)
  local diff=$((now - last_mod))
  if [[ $diff -gt $STALL_TIMEOUT ]]; then
    alert CRITICAL "Log stalled for ${diff}s ($(( diff/60 )) min)! Possible NCCL hang."
    # Auto-restart on NCCL hang
    if $AUTO_RESTART; then
      alert WARNING "Attempting auto-restart..."
      local pid=$(pgrep -f "pretrain.py.*korean_3b" | head -1 || true)
      if [[ -n "$pid" ]]; then
        kill -9 "$pid" 2>/dev/null || true
        sleep 5
        alert INFO "Previous process killed. Re-run launch_3b_pretrain.sh to resume."
      fi
    fi
  fi
}

# ---- Check: Disk (stricter for 3B) -------------------------------------------
check_disk() {
  local usage=$(df /PROJECT 2>/dev/null | awk 'NR==2 {print $5}' | tr -d '%')
  if [[ -n "$usage" && "$usage" -gt "$DISK_CRITICAL_PCT" ]]; then
    alert CRITICAL "Disk ${usage}% > ${DISK_CRITICAL_PCT}%! Clean up immediately!"
    $AUTO_CLEANUP && cleanup_old_checkpoints
  elif [[ -n "$usage" && "$usage" -gt "$DISK_WARN_PCT" ]]; then
    alert WARNING "Disk ${usage}% > ${DISK_WARN_PCT}%. Checkpoint cleanup recommended."
  fi
}

# ---- Check: GPU utilization -------------------------------------------------
check_gpu() {
  command -v nvidia-smi &>/dev/null || return 0
  local low=0 total=0
  while IFS= read -r util; do
    ((total++))
    [[ "$util" -lt "$GPU_UTIL_WARN" ]] && ((low++))
  done < <(nvidia-smi --query-gpu=utilization.gpu --format=csv,noheader,nounits 2>/dev/null)
  [[ $total -gt 0 && $low -gt 0 ]] && alert INFO "${low}/${total} GPU util < ${GPU_UTIL_WARN}%"
}

# ---- Check: checkpoint integrity ---------------------------------------------
check_checkpoint_integrity() {
  local latest=$(ls -d "${CKPT_DIR}"/checkpoint-* 2>/dev/null | sort -V | tail -1 || true)
  [[ -z "$latest" ]] && return 0
  # Minimal file-presence check
  if [[ ! -f "${latest}/model.pt" ]] && [[ ! -f "${latest}/model.safetensors" ]]; then
    alert WARNING "Latest checkpoint has no model file: ${latest}"
  fi
  # Size check (a 3B model.pt should be at least 2 GB)
  local size=$(du -sb "${latest}" 2>/dev/null | awk '{print $1}')
  if [[ -n "$size" && "$size" -lt 2000000000 ]]; then
    alert WARNING "Abnormal checkpoint size (${size} bytes < 2GB): ${latest}"
  fi
}

# ---- Cleanup: auto-delete old checkpoints ------------------------------------
cleanup_old_checkpoints() {
  local ckpts=($(ls -d "${CKPT_DIR}"/checkpoint-* 2>/dev/null | sort -V))
  local count=${#ckpts[@]}
  if [[ $count -le $MAX_CHECKPOINTS ]]; then
    alert OK "${count} checkpoints ≤ ${MAX_CHECKPOINTS}. No cleanup needed."
    return
  fi
  # Preserve milestone checkpoints (every 10K steps)
  local deletable=()
  local preserved=()
  for ckpt in "${ckpts[@]}"; do
    local step_num=$(basename "$ckpt" | grep -oP '\d+' || echo "0")
    if (( step_num % 10000 == 0 && step_num > 0 )); then
      preserved+=("$ckpt")
    else
      deletable+=("$ckpt")
    fi
  done
  # Always keep the most recent MAX_CHECKPOINTS
  local n_deletable=${#deletable[@]}
  local total_keep=$(( ${#preserved[@]} + MAX_CHECKPOINTS ))
  local to_delete=$(( count - total_keep ))
  [[ $to_delete -le 0 ]] && { alert OK "No cleanup needed (keeping ${#preserved[@]} milestones + ${MAX_CHECKPOINTS} recent)."; return; }
  alert INFO "${count} checkpoints → deleting ${to_delete} (permanently keeping ${#preserved[@]} milestones)"
  local deleted=0
  for ckpt in "${deletable[@]}"; do
    [[ $deleted -ge $to_delete ]] && break
    local ckpt_size=$(du -sh "$ckpt" 2>/dev/null | awk '{print $1}')
    echo "  delete: $ckpt (${ckpt_size})"
    rm -rf "$ckpt"
    ((deleted++))
  done
  alert OK "Checkpoint cleanup complete. (${deleted} deleted)"
}

# ---- ETA estimation -----------------------------------------------------------
estimate_eta() {
  [[ ! -f "$LOG_FILE" ]] && return
  # Latest step number + time
  local lines=$(parse_metrics 50)
  [[ -z "$lines" ]] && return
  local last_line=$(echo "$lines" | tail -1)
  local first_line=$(echo "$lines" | head -1)
  local cur_step=$(extract_step "$last_line")
  local max_steps=$(grep -oP "max_steps.*?(\d+)" "${CKPT_DIR}/train.log" 2>/dev/null | head -1 | grep -oP '\d+$' || echo "57000")

  [[ -z "$cur_step" || "$cur_step" == "0" ]] && return

  # step/sec from log timestamps (approximate)
  local remaining=$((max_steps - cur_step))
  if [[ $remaining -le 0 ]]; then
    echo -e "${MAGENTA}📊 Progress: ${cur_step}/${max_steps} (done!)${NC}"
    return
  fi

  # Rough ETA based on the log file's first timestamp
  local first_time=$(head -20 "$LOG_FILE" 2>/dev/null | grep -oP '^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}' | head -1 || true)
  if [[ -n "$first_time" ]]; then
    local start_epoch=$(date -d "$first_time" +%s 2>/dev/null || echo 0)
    local now=$(date +%s)
    if [[ $start_epoch -gt 0 && $cur_step -gt 0 ]]; then
      local elapsed=$((now - start_epoch))
      local sec_per_step=$(echo "$elapsed / $cur_step" | bc -l 2>/dev/null || echo "0")
      local eta_sec=$(echo "$remaining * $sec_per_step" | bc 2>/dev/null | cut -d. -f1 || echo "0")
      local eta_hours=$(echo "$eta_sec / 3600" | bc 2>/dev/null || echo "?")
      local pct=$(echo "scale=1; $cur_step * 100 / $max_steps" | bc 2>/dev/null || echo "?")
      echo -e "${MAGENTA}📊 Progress: ${cur_step}/${max_steps} (${pct}%) | ETA: ~${eta_hours}h | ${sec_per_step}s/step${NC}"
    fi
  else
    echo -e "${MAGENTA}📊 Progress: ${cur_step}/${max_steps}${NC}"
  fi
}

# ---- Status summary ---------------------------------------------------------
print_status() {
  local lines=$(parse_metrics 1)
  [[ -n "$lines" ]] && echo -e "${GREEN}Latest:${NC} $lines"
  estimate_eta
  if command -v nvidia-smi &>/dev/null; then
    echo -e "${CYAN}GPU:${NC}"
    nvidia-smi --query-gpu=index,memory.used,memory.total,utilization.gpu,temperature.gpu \
      --format=csv,noheader 2>/dev/null | head -8
  fi
  local ckpt_count=$(ls -d "${CKPT_DIR}"/checkpoint-* 2>/dev/null | wc -l)
  local ckpt_size=$(du -sh "${CKPT_DIR}" 2>/dev/null | awk '{print $1}')
  echo -e "${CYAN}Checkpoints:${NC} ${ckpt_count} (${ckpt_size})"
  local disk=$(df -h /PROJECT 2>/dev/null | awk 'NR==2 {print $3"/"$2" ("$5")"}')
  echo -e "${CYAN}Disk:${NC} ${disk}"
}

# ---- Main -------------------------------------------------------------------
echo "=================================================================="
echo " 3B Training Monitor"
echo " Run: ${RUN_NAME}"
echo " Log: ${LOG_FILE}"
echo " Interval: ${CHECK_INTERVAL}s"
echo " Auto-cleanup: ${AUTO_CLEANUP} | Auto-restart: ${AUTO_RESTART}"
echo " Ctrl+C to stop"
echo "=================================================================="

run_all_checks() {
  check_loss_zero || true
  check_loss_spike || true
  check_gnorm || true
  check_stall || true
  check_disk || true
  check_gpu || true
  check_checkpoint_integrity || true
  echo "---"
  print_status
  echo ""
}

if $CHECK_ONCE; then
  run_all_checks
  exit 0
fi

while true; do
  run_all_checks
  sleep "$CHECK_INTERVAL"
done
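One plausible way to keep this monitor alive across SSH disconnects is to detach it with nohup; the log and PID paths below are illustrative, not part of the script:

```bash
# Hypothetical detached launch of the 3B monitor with auto-cleanup enabled.
nohup bash scripts/monitor_3b.sh --auto-cleanup \
  > checkpoints/korean_3b_fp8_run1/monitor.log 2>&1 &
echo $! > checkpoints/korean_3b_fp8_run1/monitor.pid  # illustrative PID file
```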
source/scripts/monitor_training.sh
ADDED
@@ -0,0 +1,244 @@
#!/usr/bin/env bash
# =============================================================================
# monitor_training.sh — real-time SFT training monitor + anomaly detection
#
# Usage:
#   bash scripts/monitor_training.sh                      # default log path
#   bash scripts/monitor_training.sh /path/to/train.log   # custom path
#   bash scripts/monitor_training.sh --check-once         # run once and exit
#
# Checks:
#   🔴 loss = 0.0000 (3 consecutive steps) → labels bug
#   🔴 gnorm > 50.0                        → on the verge of divergence
#   🔴 log silent for 5+ minutes           → hang
#   🟠 loss spike (3× moving average)      → bad batch / LR
#   🟠 gnorm > 10.0                        → instability
#   🟠 disk > 80%                          → cleanup needed
#   🟡 GPU util < 50%                      → bottleneck
# =============================================================================
set -euo pipefail

# ---- Configuration ----------------------------------------------------------
LOG_FILE="${1:-checkpoints/korean_1b_sft/train.log}"
CHECK_INTERVAL=30        # polling interval in seconds
ZERO_LOSS_THRESHOLD=3    # warn after N consecutive loss=0 steps
GNORM_WARN=10.0
GNORM_CRITICAL=50.0
LOSS_SPIKE_FACTOR=3.0    # spike if > N× the moving average
STALL_TIMEOUT=300        # seconds (5 min) of log silence
DISK_WARN_PCT=80
GPU_UTIL_WARN=50
CHECK_ONCE=false

if [[ "${1:-}" == "--check-once" ]]; then
  CHECK_ONCE=true
  LOG_FILE="${2:-checkpoints/korean_1b_sft/train.log}"
fi

# ---- Colors -----------------------------------------------------------------
RED='\033[0;31m'
YELLOW='\033[1;33m'
GREEN='\033[0;32m'
CYAN='\033[0;36m'
NC='\033[0m'

# ---- Helper -----------------------------------------------------------------
timestamp() { date '+%Y-%m-%d %H:%M:%S'; }

alert() {
  local level="$1" msg="$2"
  case "$level" in
    CRITICAL) echo -e "${RED}🔴 [$(timestamp)] [CRITICAL] ${msg}${NC}" ;;
    WARNING)  echo -e "${YELLOW}🟠 [$(timestamp)] [WARNING] ${msg}${NC}" ;;
    INFO)     echo -e "${CYAN}🟡 [$(timestamp)] [INFO] ${msg}${NC}" ;;
    OK)       echo -e "${GREEN}✅ [$(timestamp)] [OK] ${msg}${NC}" ;;
  esac
}

# ---- Parse last N log lines -------------------------------------------------
parse_metrics() {
  # Log format: [timestamp] [INFO] step XXXX | loss X.XXXX | lr X.XXe-XX | gnorm X.XXX | ...
  local n="${1:-20}"
  if [[ ! -f "$LOG_FILE" ]]; then
    echo ""
    return
  fi
  tail -n "$n" "$LOG_FILE" | grep "step.*loss.*gnorm" || true
}

extract_field() {
  # $1=line, $2=field name (loss, gnorm, lr)
  echo "$1" | grep -oP "${2}\s+\K[0-9]+\.[0-9e+\-]+" | head -1
}

# ---- Check functions ----------------------------------------------------------

check_loss_zero() {
  local lines
  lines=$(parse_metrics "$ZERO_LOSS_THRESHOLD")
  if [[ -z "$lines" ]]; then return; fi

  local zero_count=0
  while IFS= read -r line; do
    local loss
    loss=$(extract_field "$line" "loss")
    if [[ -n "$loss" ]]; then
      # loss < 0.001
      if (( $(echo "$loss < 0.001" | bc -l 2>/dev/null || echo 0) )); then
        ((zero_count++))
      fi
    fi
  done <<< "$lines"

  if [[ $zero_count -ge $ZERO_LOSS_THRESHOLD ]]; then
    alert CRITICAL "Loss ~0 for ${zero_count} consecutive steps! Possible labels bug. Stop training immediately!"
    return 1
  fi
  return 0
}

check_loss_spike() {
  local lines
  lines=$(parse_metrics 20)
  if [[ -z "$lines" ]]; then return 0; fi

  local losses=()
  while IFS= read -r line; do
    local loss
    loss=$(extract_field "$line" "loss")
    [[ -n "$loss" ]] && losses+=("$loss")
  done <<< "$lines"

  local count=${#losses[@]}
  if [[ $count -lt 5 ]]; then return 0; fi

  # Compare the latest value against the mean of the preceding ones
  local last_loss="${losses[$((count-1))]}"
  local sum=0
  for ((i=0; i<count-1; i++)); do
    sum=$(echo "$sum + ${losses[$i]}" | bc -l 2>/dev/null || echo "$sum")
  done
  local avg=$(echo "$sum / ($count - 1)" | bc -l 2>/dev/null || echo "0")

  if [[ "$avg" != "0" ]]; then
    local ratio=$(echo "$last_loss / $avg" | bc -l 2>/dev/null || echo "1")
    if (( $(echo "$ratio > $LOSS_SPIKE_FACTOR" | bc -l 2>/dev/null || echo 0) )); then
      alert WARNING "Loss spike detected! current=${last_loss}, mean=${avg}, ratio=${ratio}x"
    fi
  fi
  return 0
}

check_gnorm() {
  local lines
  lines=$(parse_metrics 5)
  if [[ -z "$lines" ]]; then return 0; fi

  local last_line
  last_line=$(echo "$lines" | tail -1)
  local gnorm
  gnorm=$(extract_field "$last_line" "gnorm")

  if [[ -z "$gnorm" ]]; then return 0; fi

  if (( $(echo "$gnorm > $GNORM_CRITICAL" | bc -l 2>/dev/null || echo 0) )); then
    alert CRITICAL "GNorm=${gnorm} > ${GNORM_CRITICAL}! On the verge of divergence. Consider stopping."
  elif (( $(echo "$gnorm > $GNORM_WARN" | bc -l 2>/dev/null || echo 0) )); then
    alert WARNING "GNorm=${gnorm} > ${GNORM_WARN}. Signs of instability."
  fi
  return 0
}

check_stall() {
  if [[ ! -f "$LOG_FILE" ]]; then
    alert INFO "Log file not found: ${LOG_FILE}"
    return 0
  fi

  local last_modified
  last_modified=$(stat -c %Y "$LOG_FILE" 2>/dev/null || echo 0)
  local now
  now=$(date +%s)
  local diff=$((now - last_modified))

  if [[ $diff -gt $STALL_TIMEOUT ]]; then
    alert CRITICAL "No log update for ${diff}s ($(( diff/60 )) min)! Possible hang."
  fi
  return 0
}

check_disk() {
  local usage
  usage=$(df /PROJECT 2>/dev/null | awk 'NR==2 {print $5}' | tr -d '%')
  if [[ -n "$usage" && "$usage" -gt "$DISK_WARN_PCT" ]]; then
    alert WARNING "Disk usage ${usage}% > ${DISK_WARN_PCT}%. Checkpoint cleanup needed."
  fi
  return 0
}

check_gpu() {
  if ! command -v nvidia-smi &>/dev/null; then return 0; fi

  local low_util=0
  local total_gpus=0
  while IFS= read -r util; do
    ((total_gpus++))
    if [[ "$util" -lt "$GPU_UTIL_WARN" ]]; then
      ((low_util++))
    fi
  done < <(nvidia-smi --query-gpu=utilization.gpu --format=csv,noheader,nounits 2>/dev/null)

  if [[ $total_gpus -gt 0 && $low_util -gt 0 ]]; then
    alert INFO "${low_util}/${total_gpus} GPU utilization < ${GPU_UTIL_WARN}%. Data-loading bottleneck?"
  fi
  return 0
}

# ---- Status summary ---------------------------------------------------------
print_status() {
  local lines
  lines=$(parse_metrics 1)
  if [[ -n "$lines" ]]; then
    echo -e "${GREEN}Latest log:${NC} $lines"
  fi

  if command -v nvidia-smi &>/dev/null; then
    echo -e "${CYAN}GPU memory:${NC}"
    nvidia-smi --query-gpu=index,memory.used,memory.total,utilization.gpu \
      --format=csv,noheader 2>/dev/null | head -8
  fi

  local disk
  disk=$(df -h /PROJECT 2>/dev/null | awk 'NR==2 {print "used: "$3"/"$2" ("$5")"}')
  echo -e "${CYAN}Disk:${NC} ${disk}"
}

# ---- Main loop --------------------------------------------------------------
echo "=================================================================="
echo " SFT Training Monitor"
echo " Log file: ${LOG_FILE}"
echo " Check interval: ${CHECK_INTERVAL}s"
echo " Press Ctrl+C to stop"
echo "=================================================================="

run_all_checks() {
  check_loss_zero || true
  check_loss_spike || true
  check_gnorm || true
  check_stall || true
  check_disk || true
  check_gpu || true
  echo "---"
  print_status
  echo ""
}

if $CHECK_ONCE; then
  run_all_checks
  exit 0
fi

while true; do
  run_all_checks
  sleep "$CHECK_INTERVAL"
done
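Both monitors assume trainer log lines of the form `step N | loss X | lr X | gnorm X`. A quick compatibility test against a sample line (the line itself is made up) runs the same `grep -oP` extraction the scripts use:

```bash
# Verify a log line parses the way extract_field() expects.
line='[2026-03-01 12:00:00] [INFO] step 1200 | loss 2.4130 | lr 3.00e-04 | gnorm 1.212'
echo "$line" | grep -oP 'loss\s+\K[0-9]+\.[0-9e+\-]+'    # prints 2.4130
echo "$line" | grep -oP 'gnorm\s+\K[0-9]+\.[0-9e+\-]+'   # prints 1.212
```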
source/scripts/openclaw-watchdog.sh
ADDED
@@ -0,0 +1,243 @@
#!/usr/bin/env bash
#
# openclaw-watchdog.sh — OpenClaw Gateway health check + automatic restart
# Register in crontab to run every minute
#
# Usage:
#   */1 * * * * /PROJECT/0325120031_A/ghong/taketimes/llm-bang/scripts/openclaw-watchdog.sh
#
# Changelog:
#   2026-03-01  Switched network check from ICMP to HTTP (for ICMP-blocked environments)
#               Added multi-endpoint fallback and a gateway HTTP response check
#               Detached launch via setsid, more detailed logging

set -euo pipefail

# ── Configuration ──────────────────────────────────────
RNTIER_HOME="REDACTED_RNTIER_PATH"
OPENCLAW_BIN="${RNTIER_HOME}/.npm-global/bin/openclaw"
GATEWAY_PORT=18789
GATEWAY_HOST="127.0.0.1"
PID_FILE="/tmp/openclaw-gateway.pid"
LOG_DIR="/tmp/openclaw"
LOG_FILE="${LOG_DIR}/watchdog.log"
GATEWAY_LOG="${LOG_DIR}/gateway.log"
MAX_LOG_SIZE=$((10 * 1024 * 1024))  # rotate at 10 MB
RESTART_COOLDOWN=120                # seconds — no retry within this window after a restart
LAST_RESTART_FILE="/tmp/openclaw-last-restart"
CONSECUTIVE_FAIL_FILE="/tmp/openclaw-consecutive-fails"

# Environment — so openclaw can find its config
export PATH="${RNTIER_HOME}/.npm-global/bin:/usr/bin:/usr/local/bin:/bin:$PATH"
export HOME="/home/ghong"
export OPENCLAW_STATE_DIR="${RNTIER_HOME}/.openclaw"
export OPENCLAW_CONFIG_PATH="${RNTIER_HOME}/.openclaw/openclaw.json"

# ── Functions ──────────────────────────────────────────
mkdir -p "$LOG_DIR"

log() {
  echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*" >> "$LOG_FILE"
}

rotate_log() {
  local file="$1"
  if [[ -f "$file" ]] && [[ $(stat -c%s "$file" 2>/dev/null || echo 0) -gt $MAX_LOG_SIZE ]]; then
    mv "$file" "${file}.old"
    log "Log rotated: $file"
  fi
}

# Local HTTP check against the gateway's actual endpoint
check_gateway_http() {
  if command -v curl &>/dev/null; then
    curl -sf --max-time 5 -o /dev/null "http://${GATEWAY_HOST}:${GATEWAY_PORT}/__openclaw__/canvas/" 2>/dev/null
    return $?
  fi
  return 1
}

is_port_open() {
  if command -v ss &>/dev/null; then
    ss -tlnH "sport = :${GATEWAY_PORT}" 2>/dev/null | grep -q "$GATEWAY_PORT"
  else
    (echo > /dev/tcp/"$GATEWAY_HOST"/"$GATEWAY_PORT") 2>/dev/null
  fi
}

is_process_alive() {
  if [[ -f "$PID_FILE" ]]; then
    local pid
    pid=$(cat "$PID_FILE" 2>/dev/null)
    if [[ -n "$pid" ]] && kill -0 "$pid" 2>/dev/null; then
      return 0
    fi
  fi
  pgrep -f "openclaw.*gateway" >/dev/null 2>&1
}

# Network check — based on DNS resolution
# On this server both ICMP (ping) and outbound HTTPS (curl) are blocked.
# DNS resolution works, however, and the gateway (Node.js) can communicate via long-polling.
# So DNS resolution success is used to decide whether the network itself is alive.
check_network() {
  # Method 1: getent (fastest and lightest)
  if command -v getent &>/dev/null; then
    getent hosts api.telegram.org >/dev/null 2>&1 && return 0
    getent hosts api.anthropic.com >/dev/null 2>&1 && return 0
  fi
  # Method 2: nslookup
  if command -v nslookup &>/dev/null; then
    nslookup -timeout=5 api.telegram.org >/dev/null 2>&1 && return 0
  fi
  # Method 3: probe DNS server (168.126.63.1) port 53 via /dev/tcp
  (echo > /dev/tcp/168.126.63.1/53) 2>/dev/null && return 0
  return 1
}

cooldown_active() {
  if [[ -f "$LAST_RESTART_FILE" ]]; then
    local last_restart now diff
    last_restart=$(cat "$LAST_RESTART_FILE" 2>/dev/null)
    now=$(date +%s)
    diff=$(( now - last_restart ))
    if [[ $diff -lt $RESTART_COOLDOWN ]]; then
      return 0  # cooling down
    fi
  fi
  return 1  # not cooling down
}

get_consecutive_fails() {
  if [[ -f "$CONSECUTIVE_FAIL_FILE" ]]; then
    cat "$CONSECUTIVE_FAIL_FILE" 2>/dev/null || echo 0
  else
    echo 0
  fi
}

set_consecutive_fails() {
  echo "$1" > "$CONSECUTIVE_FAIL_FILE"
}

start_gateway() {
  log "ACTION: Starting OpenClaw gateway on port $GATEWAY_PORT..."

  # Clean up stale/zombie processes
  local old_pids
  old_pids=$(pgrep -f "openclaw.*gateway" 2>/dev/null || true)
  if [[ -n "$old_pids" ]]; then
    log "ACTION: Killing stale gateway processes: $old_pids"
    echo "$old_pids" | xargs kill -9 2>/dev/null || true
    sleep 2
  fi

  # Start the gateway — fully detached via setsid (prevents signal propagation from the parent)
  setsid nohup "$OPENCLAW_BIN" gateway run \
    --port "$GATEWAY_PORT" \
    --bind loopback \
    >> "$GATEWAY_LOG" 2>&1 < /dev/null &

  local new_pid=$!
  echo "$new_pid" > "$PID_FILE"
  date +%s > "$LAST_RESTART_FILE"

  log "ACTION: Gateway launched with PID $new_pid (setsid)"

  # Wait 8 s before verifying (the Telegram provider needs time to initialize)
  sleep 8
  if kill -0 "$new_pid" 2>/dev/null; then
    log "OK: Gateway PID $new_pid is alive after startup"
    if is_port_open; then
      log "OK: Port $GATEWAY_PORT is listening"
    else
      log "WARN: Gateway alive but port $GATEWAY_PORT not yet listening (may need more time)"
    fi
    return 0
  else
    log "ERROR: Gateway PID $new_pid died immediately after start"
    log "ERROR: Last 10 lines of gateway.log:"
    tail -10 "$GATEWAY_LOG" 2>/dev/null | while read -r line; do
      log "  | $line"
    done
    return 1
  fi
}

# ── Main logic ─────────────────────────────────────────
rotate_log "$LOG_FILE"
rotate_log "$GATEWAY_LOG"

# Delete old openclaw log files (7+ days)
find "$LOG_DIR" -name "openclaw-*.log" -mtime +7 -delete 2>/dev/null || true

# 1) Check process + port first (if the gateway is alive, no network check is needed)
process_ok=false
port_ok=false
http_ok=false

if is_process_alive; then
  process_ok=true
fi

if is_port_open; then
  port_ok=true
fi

if $port_ok && check_gateway_http; then
  http_ok=true
fi

# 2) If the gateway is healthy, exit immediately
if $process_ok && $port_ok; then
  if $http_ok; then
    # Fully healthy
    set_consecutive_fails 0
    exit 0
  fi
  # Process + port OK but no HTTP response → possibly hung
  fails=$(get_consecutive_fails)
  fails=$((fails + 1))
  set_consecutive_fails "$fails"
  log "WARN: Process alive, port open, but HTTP not responding (consecutive: $fails)"
  if [[ $fails -lt 3 ]]; then
    log "INFO: Waiting more cycles before restart (transient check, $fails/3)"
    exit 0
  fi
  log "WARN: HTTP unresponsive for $fails consecutive checks — proceeding to restart"
fi

# 3) Gateway is unhealthy — check the network, then decide whether to restart
if $process_ok && ! $port_ok; then
  log "WARN: Process alive but port $GATEWAY_PORT not listening. Possible hung state."
fi

if ! $process_ok && ! $port_ok; then
  log "WARN: Gateway is completely down (no process, no port)."
fi

if ! $process_ok && $port_ok; then
  log "WARN: No known gateway process but port $GATEWAY_PORT is in use. Stale process?"
fi

# 4) Network check — DNS-based (only runs when the gateway is down)
if ! check_network; then
  log "WARN: Network unreachable (DNS resolution failed). Skipping gateway restart."
  exit 0
fi

# 5) Cooldown check
if cooldown_active; then
  log "INFO: Cooldown active (last restart < ${RESTART_COOLDOWN}s ago). Skipping."
  exit 0
fi

# 6) Restart
log "ACTION: Attempting gateway restart..."
if start_gateway; then
  log "OK: Gateway restart SUCCESS"
  set_consecutive_fails 0
else
  log "ERROR: Gateway restart FAILED"
  exit 1
fi
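Since cron fires this watchdog every minute regardless of how long the previous run takes (the restart path alone sleeps 8 s plus kill delays), wrapping the crontab entry in flock(1) prevents overlapping invocations. A sketch, assuming flock is installed; the lock path is arbitrary:

```bash
# Hypothetical crontab entry: -n skips the run if the previous one still holds the lock.
*/1 * * * * flock -n /tmp/openclaw-watchdog.lock /PROJECT/0325120031_A/ghong/taketimes/llm-bang/scripts/openclaw-watchdog.sh
```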
source/scripts/orpo_eval_watchdog.sh
ADDED
@@ -0,0 +1,127 @@
#!/bin/bash
# =============================================================================
# ORPO Training Completion Watchdog
# =============================================================================
# Monitors the ORPO training process. When it finishes, automatically launches
# the comprehensive evaluation pipeline.
#
# Usage:
#   nohup bash scripts/orpo_eval_watchdog.sh > checkpoints/korean_3b_orpo_v1/watchdog.log 2>&1 &
# =============================================================================

set -euo pipefail

PROJECT_ROOT="/PROJECT/0325120031_A/ghong/taketimes/llm-bang"
TRAIN_LOG="${PROJECT_ROOT}/checkpoints/korean_3b_orpo_v1/train.log"
# `|| true` so a missing process does not kill the script under `pipefail`
TRAIN_PID=$(pgrep -f "train/orpo.py.*korean_3b_orpo_v1" | head -1 || true)

echo "=============================================="
echo " ORPO Eval Watchdog Started"
echo "=============================================="
echo " Time      : $(date '+%Y-%m-%d %H:%M:%S')"
echo " Train PID : ${TRAIN_PID:-NOT FOUND}"
echo " Train Log : ${TRAIN_LOG}"
echo "=============================================="

if [ -z "${TRAIN_PID}" ]; then
  echo "[WARN] Training process not found. Checking if already completed..."
  # Check if training already finished by looking for final output
  if grep -q "Training completed" "${TRAIN_LOG}" 2>/dev/null || \
     grep -q "Saving model checkpoint" "${TRAIN_LOG}" 2>/dev/null; then
    echo "[INFO] Training appears to have already completed."
  else
    echo "[ERROR] No training process and no completion marker found. Exiting."
    exit 1
  fi
else
  echo "[INFO] Watching training PID ${TRAIN_PID}..."
  echo ""

  # Poll every 60 seconds
  while kill -0 "${TRAIN_PID}" 2>/dev/null; do
    # Get current step
    CURRENT_STEP=$(grep -oP '\d+/9840' "${TRAIN_LOG}" 2>/dev/null | tail -1 || echo "?/?")
    LATEST_LOSS=$(grep "'loss':" "${TRAIN_LOG}" 2>/dev/null | tail -1 | grep -oP "'loss': '([^']+)'" | sed "s/'loss': '//;s/'//" || echo "?")
    echo "[$(date '+%H:%M:%S')] Step ${CURRENT_STEP} | Loss: ${LATEST_LOSS} | PID ${TRAIN_PID} running"
    sleep 60
  done

  echo ""
  echo "=============================================="
  echo "[INFO] Training process ${TRAIN_PID} has ended."
  echo "[INFO] Time: $(date '+%Y-%m-%d %H:%M:%S')"
  echo "=============================================="
fi

# Wait a moment for any final I/O
sleep 10

# Get final training stats
echo ""
echo "[INFO] Final training stats:"
grep "eval_loss" "${TRAIN_LOG}" | tail -1 | tr ',' '\n' | head -10 || true
echo ""

# Detect the latest checkpoint
LATEST_CKPT=$(ls -d ${PROJECT_ROOT}/checkpoints/korean_3b_orpo_v1/checkpoint-* 2>/dev/null | sort -t- -k2 -n | tail -1)
echo "[INFO] Latest checkpoint: ${LATEST_CKPT}"

if [ -z "${LATEST_CKPT}" ]; then
  echo "[ERROR] No checkpoint found. Cannot proceed with evaluation."
  exit 1
fi

# Send telegram notification (if available)
python3 -c "
import os, urllib.request, urllib.parse, json
token = os.environ.get('TELEGRAM_BOT_TOKEN', '')
chat_id = os.environ.get('TELEGRAM_CHAT_ID', '')
if token and chat_id:
    msg = '🏁 ORPO training finished! Starting automatic evaluation.\nCheckpoint: ${LATEST_CKPT##*/}'
    url = f'https://api.telegram.org/bot{token}/sendMessage'
    data = urllib.parse.urlencode({'chat_id': chat_id, 'text': msg}).encode()
    urllib.request.urlopen(url, data, timeout=10)
    print('[INFO] Telegram notification sent.')
else:
    print('[INFO] Telegram not configured, skipping notification.')
" 2>/dev/null || true

# ============================================================================
# Launch evaluation pipeline
# ============================================================================
echo ""
echo "=============================================="
echo " Starting ORPO Evaluation Pipeline"
echo " Time: $(date '+%Y-%m-%d %H:%M:%S')"
echo "=============================================="

cd "${PROJECT_ROOT}"

# Capture the pipeline's own exit code: `$?` after a pipe reports tee's status,
# and `set -e` would abort before it is read, so disable -e around the call.
set +e
python3 eval/orpo_eval_pipeline.py \
    --checkpoint "${LATEST_CKPT}" \
    2>&1 | tee -a checkpoints/korean_3b_orpo_v1/eval.log
EVAL_EXIT=${PIPESTATUS[0]}
set -e

echo ""
echo "=============================================="
echo " Evaluation Complete"
echo " Exit code: ${EVAL_EXIT}"
echo " Time: $(date '+%Y-%m-%d %H:%M:%S')"
echo "=============================================="

# Send completion notification
python3 -c "
import os, urllib.request, urllib.parse
token = os.environ.get('TELEGRAM_BOT_TOKEN', '')
chat_id = os.environ.get('TELEGRAM_CHAT_ID', '')
if token and chat_id:
    exit_code = ${EVAL_EXIT}
    status = '✅ success' if exit_code == 0 else '❌ failure'
    msg = f'ORPO evaluation complete: {status}\nExit code: {exit_code}\nSee reports/ for the report'
    url = f'https://api.telegram.org/bot{token}/sendMessage'
    data = urllib.parse.urlencode({'chat_id': chat_id, 'text': msg}).encode()
    urllib.request.urlopen(url, data, timeout=10)
" 2>/dev/null || true

exit ${EVAL_EXIT}
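The two embedded Python snippets read TELEGRAM_BOT_TOKEN and TELEGRAM_CHAT_ID from the environment and silently skip notification when they are unset, so they must be exported in the shell that launches the watchdog. A sketch with placeholder values:

```bash
# Export Telegram credentials, then launch as in the Usage header above.
export TELEGRAM_BOT_TOKEN='<bot-token>'   # placeholder
export TELEGRAM_CHAT_ID='<chat-id>'       # placeholder
nohup bash scripts/orpo_eval_watchdog.sh \
  > checkpoints/korean_3b_orpo_v1/watchdog.log 2>&1 &
```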
source/scripts/orpo_hp_sweep.sh
ADDED
@@ -0,0 +1,166 @@
#!/usr/bin/env bash
# =============================================================================
# orpo_hp_sweep.sh — ORPO Hyperparameter Sweep (200 steps each)
#
# Runs each configuration for 200 steps to find the best combination.
# Results are written to the sweep results file below.
#
# Usage:
#   bash scripts/orpo_hp_sweep.sh             # full sweep (6 runs)
#   bash scripts/orpo_hp_sweep.sh --dry-run   # print configs only
# =============================================================================
set -uo pipefail
# NOTE: set +e — individual runs may fail; we log failures and continue the sweep

cd "$(dirname "$0")/.."

SWEEP_STEPS=200
SWEEP_DIR="checkpoints/orpo_sweep"
RESULTS_FILE="${SWEEP_DIR}/sweep_results.jsonl"
BASE_MODEL="eval/outputs/hf_3b_sft_best"
DATA_PATH="data/preference/combined_preference.jsonl"
NPROC=8
MASTER_PORT_BASE=29510

# B200 NCCL tuning (NVSwitch mesh — let NCCL auto-detect proto/channels/algo)
export NCCL_IB_DISABLE=1
export NCCL_BUFFSIZE=134217728
export OMP_NUM_THREADS=9
export MKL_NUM_THREADS=9
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export NCCL_P2P_LEVEL=NVL
export PYTHONWARNINGS="ignore::UserWarning:torch.library"

mkdir -p "${SWEEP_DIR}"
declare -a FAILED_RUNS=()

# ---------------------------------------------------------------------------
# Sweep configurations: (name, beta, lr, max_length, batch_size, grad_accum)
# ---------------------------------------------------------------------------
# Main search axes:
#   1. beta: repetition-suppression strength (0.15 vs 0.25 vs 0.35)
#   2. lr: convergence speed (5e-6 vs 8e-6 vs 1.2e-5)
#   3. max_length: VRAM vs coverage (1024 vs 1536)

declare -a CONFIGS=(
  # name                 beta  lr      max_len bs accum
  "baseline_b015_lr8e6   0.15  8e-6    1536    4  4"
  "baseline_b025_lr8e6   0.25  8e-6    1536    4  4"
  "strong_b035_lr8e6     0.35  8e-6    1536    4  4"
  "fast_b025_lr12e6      0.25  1.2e-5  1536    4  4"
  "conserv_b025_lr5e6    0.25  5e-6    1536    4  4"
  "short_b025_lr8e6      0.25  8e-6    1024    4  4"
)

DRY_RUN=false
if [[ "${1:-}" == "--dry-run" ]]; then
  DRY_RUN=true
fi

echo "=================================================================="
echo " ORPO Hyperparameter Sweep"
echo " Configs: ${#CONFIGS[@]}"
echo " Steps each: ${SWEEP_STEPS}"
echo " Results: ${RESULTS_FILE}"
echo "=================================================================="

for i in "${!CONFIGS[@]}"; do
  read -r NAME BETA LR MAX_LEN BS ACCUM <<< "${CONFIGS[$i]}"
  PORT=$((MASTER_PORT_BASE + i))
  OUTPUT="${SWEEP_DIR}/${NAME}"

  echo ""
  echo "--- Run $((i+1))/${#CONFIGS[@]}: ${NAME} ---"
  echo "    beta=${BETA} lr=${LR} max_length=${MAX_LEN} bs=${BS} accum=${ACCUM}"

  if [[ "${DRY_RUN}" == "true" ]]; then
    echo "    [DRY RUN] skipping"
    continue
  fi

  mkdir -p "${OUTPUT}"
  START_TIME=$(date +%s)

  torchrun \
    --nproc_per_node=${NPROC} \
    --master_port=${PORT} \
    train/orpo.py \
      --model_path "${BASE_MODEL}" \
      --custom_data_path "${DATA_PATH}" \
      --output_dir "${OUTPUT}" \
      --max_steps ${SWEEP_STEPS} \
      --lr ${LR} \
      --beta ${BETA} \
      --batch_size ${BS} \
      --gradient_accumulation_steps ${ACCUM} \
      --max_length ${MAX_LEN} \
      --weight_decay 0.01 \
      --warmup_ratio 0.05 \
      --eval_split_ratio 0.05 \
      --eval_steps 100 \
      --early_stopping_patience 100 \
      --save_steps 200 \
      --save_total_limit 1 \
      --logging_steps 10 \
      --report_to none \
      --dataset_num_proc 64 \
      --dataloader_num_workers 4 \
      --no_load_best \
    2>&1 | tee "${OUTPUT}/train.log"
  RUN_EXIT=$?

  END_TIME=$(date +%s)
  ELAPSED=$((END_TIME - START_TIME))

  if [[ ${RUN_EXIT} -ne 0 ]]; then
    echo "    [ERROR] Run ${NAME} failed with exit code ${RUN_EXIT} after ${ELAPSED}s"
    echo "{\"name\":\"${NAME}\",\"beta\":${BETA},\"lr\":\"${LR}\",\"max_length\":${MAX_LEN},\"status\":\"FAILED\",\"exit_code\":${RUN_EXIT},\"elapsed_s\":${ELAPSED}}" >> "${RESULTS_FILE}"
    FAILED_RUNS+=("${NAME}")
    continue
  fi

  # Extract final metrics from log
  FINAL_LOSS=$(grep -oP "'loss': '[\d.]+'" "${OUTPUT}/train.log" | tail -1 | grep -oP "[\d.]+" || echo "N/A")
  EVAL_LOSS=$(grep -oP "'eval_loss': '[\d.]+'" "${OUTPUT}/train.log" | tail -1 | grep -oP "[\d.]+" || echo "N/A")
  MARGIN=$(grep -oP "'rewards/margins': '[-\d.]+'" "${OUTPUT}/train.log" | tail -1 | grep -oP "[-\d.]+" || echo "N/A")

  # Save result
  echo "{\"name\":\"${NAME}\",\"beta\":${BETA},\"lr\":\"${LR}\",\"max_length\":${MAX_LEN},\"status\":\"OK\",\"loss\":\"${FINAL_LOSS}\",\"eval_loss\":\"${EVAL_LOSS}\",\"margin\":\"${MARGIN}\",\"elapsed_s\":${ELAPSED}}" >> "${RESULTS_FILE}"

  echo "    -> loss=${FINAL_LOSS} eval_loss=${EVAL_LOSS} margin=${MARGIN} time=${ELAPSED}s"

  # Cleanup weights to save disk (keep logs)
  rm -rf "${OUTPUT}/checkpoint-"* "${OUTPUT}/emergency_checkpoint" 2>/dev/null || true
done

echo ""
echo "=================================================================="
echo " Sweep Complete!"
echo " Results: ${RESULTS_FILE}"
if [[ -f "${RESULTS_FILE}" ]]; then
  echo ""
  echo " Summary:"
  cat "${RESULTS_FILE}" | python3 -c "
import sys, json
results = [json.loads(l) for l in sys.stdin]
results.sort(key=lambda r: float(r.get('eval_loss', '999')))
print(f' {\"Name\":<25} {\"Beta\":>6} {\"LR\":>10} {\"Loss\":>8} {\"EvalLoss\":>10} {\"Margin\":>8} {\"Time\":>6}')
print(f' {\"-\"*25} {\"-\"*6} {\"-\"*10} {\"-\"*8} {\"-\"*10} {\"-\"*8} {\"-\"*6}')
for r in results:
    print(f' {r[\"name\"]:<25} {r[\"beta\"]:>6} {r[\"lr\"]:>10} {r[\"loss\"]:>8} {r[\"eval_loss\"]:>10} {r[\"margin\"]:>8} {r[\"elapsed_s\"]:>5}s')
print()
best = results[0]
print(f' BEST: {best[\"name\"]} (eval_loss={best[\"eval_loss\"]})')
" 2>/dev/null || cat "${RESULTS_FILE}"
fi

# Report failed runs
if [[ ${#FAILED_RUNS[@]} -gt 0 ]]; then
  echo ""
  echo " FAILED RUNS (${#FAILED_RUNS[@]}):"
  for fname in "${FAILED_RUNS[@]}"; do
    echo "   - ${fname}"
  done
fi
echo "=================================================================="
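The built-in summary ranks runs by eval_loss; for ORPO the reward margin is often just as informative. A small sketch that re-ranks the same sweep_results.jsonl by margin, skipping failed runs (the field names are exactly those the script writes):

```bash
# Re-rank completed sweep runs by rewards/margins (higher is better).
python3 - <<'PY'
import json

rows = [json.loads(l) for l in open("checkpoints/orpo_sweep/sweep_results.jsonl")]
ok = [r for r in rows if r.get("status") == "OK" and r.get("margin") not in (None, "N/A")]
for r in sorted(ok, key=lambda r: float(r["margin"]), reverse=True):
    print(f'{r["name"]:<25} beta={r["beta"]}  lr={r["lr"]}  margin={r["margin"]}  eval_loss={r["eval_loss"]}')
PY
```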
source/scripts/prepare_3b_data.sh
ADDED
@@ -0,0 +1,414 @@
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
# =============================================================================
|
| 3 |
+
# prepare_3b_data.sh — 3B 모델 학습 데이터 전체 파이프라인
|
| 4 |
+
#
|
| 5 |
+
# 사용법:
|
| 6 |
+
# bash scripts/prepare_3b_data.sh [--step N] [--jobs 72]
|
| 7 |
+
#
|
| 8 |
+
# 스텝:
|
| 9 |
+
# 1 = CulturaX 토큰화
|
| 10 |
+
# 2 = cc100 해제 + 토큰화
|
| 11 |
+
# 3 = OSCAR 토큰화
|
| 12 |
+
# 4 = korean_webtext 토큰화
|
| 13 |
+
# 5 = HPLT 한국어 추출 + 토큰화
|
| 14 |
+
# 6 = textbooks + finepdfs + kovast 토큰화
|
| 15 |
+
# 7 = 전체 병합
|
| 16 |
+
# 8 = train/val split 검증
|
| 17 |
+
# =============================================================================
|
| 18 |
+
set -euo pipefail
|
| 19 |
+
|
| 20 |
+
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
| 21 |
+
PROJECT_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
|
| 22 |
+
cd "${PROJECT_ROOT}"
|
| 23 |
+
|
| 24 |
+
# ─── 설정 ────────────────────────────────────────────────────────────────
|
| 25 |
+
DATA_DIR="data"
|
| 26 |
+
EXTRA_DIR="data/korean_extra"
|
| 27 |
+
TOKENIZER="tokenizer/tokenizer.json"
|
| 28 |
+
VAL_SPLIT=0.002
|
| 29 |
+
SEED=42
|
| 30 |
+
JOBS=72
|
| 31 |
+
FROM_STEP=0
|
| 32 |
+
LOG_FILE="data/prepare_3b.log"
|
| 33 |
+
|
| 34 |
+
while [[ $# -gt 0 ]]; do
|
| 35 |
+
case $1 in
|
| 36 |
+
--step) FROM_STEP="$2"; shift 2 ;;
|
| 37 |
+
--jobs) JOBS="$2"; shift 2 ;;
|
| 38 |
+
*) echo "Unknown arg: $1"; exit 1 ;;
|
| 39 |
+
esac
|
| 40 |
+
done
|
| 41 |
+
|
| 42 |
+
mkdir -p "$(dirname "$LOG_FILE")"
|
| 43 |
+
exec > >(tee -a "$LOG_FILE") 2>&1
|
| 44 |
+
|
| 45 |
+
log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*"; }
|
| 46 |
+
|
| 47 |
+
# ─── Tokenization helper (parquet → bin) ────────────────────────────────
|
| 48 |
+
tokenize_parquet() {
|
| 49 |
+
local name="$1"
|
| 50 |
+
local input_pattern="$2"
|
| 51 |
+
local text_col="$3"
|
| 52 |
+
local output="${DATA_DIR}/${name}_train.bin"
|
| 53 |
+
|
| 54 |
+
if [[ -f "$output" && $FROM_STEP -le 0 ]]; then
|
| 55 |
+
log "[SKIP] $output already exists ($(du -h "$output" | cut -f1))"
|
| 56 |
+
return
|
| 57 |
+
fi
|
| 58 |
+
|
| 59 |
+
log "[START] Tokenizing $name from parquet..."
|
| 60 |
+
python3 - <<PYEOF
|
| 61 |
+
import glob, os, sys
|
| 62 |
+
import numpy as np
|
| 63 |
+
from tokenizers import Tokenizer
|
| 64 |
+
import pyarrow.parquet as pq
|
| 65 |
+
from tqdm import tqdm
|
| 68 |
+
|
| 69 |
+
tokenizer_path = "${TOKENIZER}"
|
| 70 |
+
input_pattern = "${input_pattern}"
|
| 71 |
+
text_col = "${text_col}"
|
| 72 |
+
output_train = "${output}"
|
| 73 |
+
output_val = output_train.replace("_train.bin", "_val.bin")
|
| 74 |
+
val_split = ${VAL_SPLIT}
|
| 75 |
+
seed = ${SEED}
|
| 76 |
+
|
| 77 |
+
files = sorted(glob.glob(input_pattern))
|
| 78 |
+
print(f"Found {len(files)} parquet files")
|
| 79 |
+
|
| 80 |
+
tokenizer = Tokenizer.from_file(tokenizer_path)
|
| 81 |
+
|
| 82 |
+
all_tokens = []
|
| 83 |
+
total_docs = 0
|
| 84 |
+
|
| 85 |
+
for f in tqdm(files, desc="${name}"):
|
| 86 |
+
try:
|
| 87 |
+
table = pq.read_table(f, columns=[text_col])
|
| 88 |
+
for text in table.column(text_col):
|
| 89 |
+
t = text.as_py()
|
| 90 |
+
if t and len(t) > 50:
|
| 91 |
+
ids = tokenizer.encode(t).ids
|
| 92 |
+
all_tokens.extend(ids)
|
| 93 |
+
total_docs += 1
|
| 94 |
+
except Exception as e:
|
| 95 |
+
print(f"Error processing {f}: {e}", file=sys.stderr)
|
| 96 |
+
continue
|
| 97 |
+
|
| 98 |
+
print(f"Total: {total_docs:,} docs, {len(all_tokens):,} tokens")
|
| 99 |
+
|
| 100 |
+
# Split
|
| 101 |
+
import random
|
| 102 |
+
random.seed(seed)
|
| 103 |
+
random.shuffle(all_tokens) # NOTE: token-level shuffle scrambles document order; kept to match the existing pipeline
|
| 104 |
+
n_val = int(len(all_tokens) * val_split)
|
| 105 |
+
val_tokens = all_tokens[:n_val]
|
| 106 |
+
train_tokens = all_tokens[n_val:]
|
| 107 |
+
|
| 108 |
+
np.array(train_tokens, dtype=np.uint16).tofile(output_train)
|
| 109 |
+
np.array(val_tokens, dtype=np.uint16).tofile(output_val)
|
| 110 |
+
print(f"Saved: {output_train} ({len(train_tokens):,} tokens)")
|
| 111 |
+
print(f"Saved: {output_val} ({len(val_tokens):,} tokens)")
|
| 112 |
+
PYEOF
|
| 113 |
+
log "[DONE] $name → $output"
|
| 114 |
+
}
|
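
The `_train.bin` / `_val.bin` files produced above are headerless little-endian uint16 token streams (written via `np.array(..., dtype=np.uint16).tofile(...)`, which caps the vocab at 65,535 ids). A minimal sketch of reading one back — illustrative only; the project's actual dataloader is not part of this diff:

```python
# Sketch: read back a raw uint16 token .bin produced by tokenize_parquet.
# Assumption: headerless stream saved with numpy's .tofile(), native byte order.
import numpy as np

def load_token_bin(path: str) -> np.memmap:
    # memmap keeps multi-GB files out of RAM until slices are touched
    return np.memmap(path, dtype=np.uint16, mode="r")

tokens = load_token_bin("data/culturax_train.bin")
print(f"{len(tokens):,} tokens; first 10 ids: {tokens[:10].tolist()}")
```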
| 115 |
+
|
| 116 |
+
# ─── Step 1: CulturaX ────────────────────────────────────────────────────
|
| 117 |
+
if [[ $FROM_STEP -le 1 ]]; then
|
| 118 |
+
log "=== Step 1: CulturaX 토큰화 ==="
|
| 119 |
+
tokenize_parquet "culturax" \
|
| 120 |
+
"${EXTRA_DIR}/culturax_ko/ko/*.parquet" \
|
| 121 |
+
"text"
|
| 122 |
+
fi
|
| 123 |
+
|
| 124 |
+
# ─── Step 2: Decompress + tokenize cc100 ────────────────────────────────
|
| 125 |
+
if [[ $FROM_STEP -le 2 ]]; then
|
| 126 |
+
log "=== Step 2: Decompress + tokenize cc100 ==="
|
| 127 |
+
CC100_XZ="${EXTRA_DIR}/cc100_ko/ko.txt.xz"
|
| 128 |
+
CC100_TXT="${EXTRA_DIR}/cc100_ko/ko.txt"
|
| 129 |
+
CC100_OUT="${DATA_DIR}/cc100_train.bin"
|
| 130 |
+
|
| 131 |
+
if [[ -f "$CC100_OUT" && $FROM_STEP -le 0 ]]; then
|
| 132 |
+
log "[SKIP] cc100 already tokenized"
|
| 133 |
+
else
|
| 134 |
+
# Decompress
|
| 135 |
+
if [[ ! -f "$CC100_TXT" ]]; then
|
| 136 |
+
log "Decompressing cc100 xz (14GB → 54GB)..."
|
| 137 |
+
xz -dk "$CC100_XZ"
|
| 138 |
+
log "Decompression done"
|
| 139 |
+
fi
|
| 140 |
+
|
| 141 |
+
# Tokenize (large file — streamed line by line; note: token ids still accumulate in RAM)
|
| 142 |
+
log "Tokenizing cc100 (54GB text)..."
|
| 143 |
+
python3 - <<'PYEOF'
|
| 144 |
+
import numpy as np
|
| 145 |
+
from tokenizers import Tokenizer
|
| 146 |
+
from tqdm import tqdm
|
| 147 |
+
import random
|
| 148 |
+
|
| 149 |
+
tokenizer = Tokenizer.from_file("tokenizer/tokenizer.json")
|
| 150 |
+
input_file = "data/korean_extra/cc100_ko/ko.txt"
|
| 151 |
+
output_train = "data/cc100_train.bin"
|
| 152 |
+
output_val = "data/cc100_val.bin"
|
| 153 |
+
|
| 154 |
+
# Stream tokenize in chunks
|
| 155 |
+
all_tokens = []
|
| 156 |
+
doc_buffer = []
|
| 157 |
+
doc_count = 0
|
| 158 |
+
|
| 159 |
+
with open(input_file, 'r', encoding='utf-8', errors='replace') as f:
|
| 160 |
+
for line in tqdm(f, desc="cc100", unit=" lines"):
|
| 161 |
+
line = line.strip()
|
| 162 |
+
if not line:
|
| 163 |
+
# Document boundary
|
| 164 |
+
if doc_buffer:
|
| 165 |
+
text = '\n'.join(doc_buffer)
|
| 166 |
+
if len(text) > 50:
|
| 167 |
+
ids = tokenizer.encode(text).ids
|
| 168 |
+
all_tokens.extend(ids)
|
| 169 |
+
doc_count += 1
|
| 170 |
+
doc_buffer = []
|
| 171 |
+
else:
|
| 172 |
+
doc_buffer.append(line)
|
| 173 |
+
|
| 174 |
+
# Last doc
|
| 175 |
+
if doc_buffer:
|
| 176 |
+
text = '\n'.join(doc_buffer)
|
| 177 |
+
if len(text) > 50:
|
| 178 |
+
all_tokens.extend(tokenizer.encode(text).ids)
|
| 179 |
+
doc_count += 1
|
| 180 |
+
|
| 181 |
+
print(f"Total: {doc_count:,} docs, {len(all_tokens):,} tokens")
|
| 182 |
+
|
| 183 |
+
# Split
|
| 184 |
+
n_val = int(len(all_tokens) * 0.002)
|
| 185 |
+
np.array(all_tokens[n_val:], dtype=np.uint16).tofile(output_train)
|
| 186 |
+
np.array(all_tokens[:n_val], dtype=np.uint16).tofile(output_val)
|
| 187 |
+
print(f"Saved train: {len(all_tokens)-n_val:,} tokens")
|
| 188 |
+
print(f"Saved val: {n_val:,} tokens")
|
| 189 |
+
PYEOF
|
| 190 |
+
log "[DONE] cc100"
|
| 191 |
+
fi
|
| 192 |
+
fi
|
| 193 |
+
|
| 194 |
+
# ─── Step 3: OSCAR ───────────────────────────────────────────────────────
|
| 195 |
+
if [[ $FROM_STEP -le 3 ]]; then
|
| 196 |
+
log "=== Step 3: OSCAR 토큰화 ==="
|
| 197 |
+
OSCAR_OUT="${DATA_DIR}/oscar_train.bin"
|
| 198 |
+
|
| 199 |
+
if [[ -f "$OSCAR_OUT" && $FROM_STEP -le 0 ]]; then
|
| 200 |
+
log "[SKIP] OSCAR already tokenized"
|
| 201 |
+
else
|
| 202 |
+
python3 - <<'PYEOF'
|
| 203 |
+
import glob, numpy as np
|
| 204 |
+
from tokenizers import Tokenizer
|
| 205 |
+
import pyarrow.parquet as pq
|
| 206 |
+
from tqdm import tqdm
|
| 207 |
+
|
| 208 |
+
tokenizer = Tokenizer.from_file("tokenizer/tokenizer.json")
|
| 209 |
+
files = sorted(glob.glob("data/korean_extra/oscar_ko/data/kor_Hang/*.parquet"))
|
| 210 |
+
all_tokens = []
|
| 211 |
+
doc_count = 0
|
| 212 |
+
|
| 213 |
+
for f in tqdm(files, desc="OSCAR"):
|
| 214 |
+
table = pq.read_table(f, columns=['text'])
|
| 215 |
+
for row in table.column('text'):
|
| 216 |
+
if row is None:
|
| 217 |
+
continue
|
| 218 |
+
parts = row.as_py()
|
| 219 |
+
if parts:
|
| 220 |
+
text = '\n'.join(item['text'] for item in parts if item and item.get('text'))
|
| 221 |
+
if len(text) > 50:
|
| 222 |
+
all_tokens.extend(tokenizer.encode(text).ids)
|
| 223 |
+
doc_count += 1
|
| 224 |
+
|
| 225 |
+
print(f"OSCAR: {doc_count:,} docs, {len(all_tokens):,} tokens")
|
| 226 |
+
n_val = int(len(all_tokens) * 0.002)
|
| 227 |
+
np.array(all_tokens[n_val:], dtype=np.uint16).tofile("data/oscar_train.bin")
|
| 228 |
+
np.array(all_tokens[:n_val], dtype=np.uint16).tofile("data/oscar_val.bin")
|
| 229 |
+
PYEOF
|
| 230 |
+
log "[DONE] OSCAR"
|
| 231 |
+
fi
|
| 232 |
+
fi
|
| 233 |
+
|
| 234 |
+
# ─── Step 4: korean_webtext ──────────────────────────────────────────────
|
| 235 |
+
if [[ $FROM_STEP -le 4 ]]; then
|
| 236 |
+
log "=== Step 4: korean_webtext 토큰화 ==="
|
| 237 |
+
tokenize_parquet "webtext" \
|
| 238 |
+
"${EXTRA_DIR}/korean_webtext/data/*.parquet" \
|
| 239 |
+
"text"
|
| 240 |
+
fi
|
| 241 |
+
|
| 242 |
+
# ─── Step 5: Extract Korean from HPLT + tokenize ────────────────────────
|
| 243 |
+
if [[ $FROM_STEP -le 5 ]]; then
|
| 244 |
+
log "=== Step 5: Extract Korean from HPLT + tokenize ==="
|
| 245 |
+
HPLT_OUT="${DATA_DIR}/hplt_ko_train.bin"
|
| 246 |
+
|
| 247 |
+
if [[ -f "$HPLT_OUT" && $FROM_STEP -le 0 ]]; then
|
| 248 |
+
log "[SKIP] HPLT already tokenized"
|
| 249 |
+
else
|
| 250 |
+
python3 - <<'PYEOF'
|
| 251 |
+
import glob, numpy as np
|
| 252 |
+
from tokenizers import Tokenizer
|
| 253 |
+
import pyarrow.parquet as pq
|
| 254 |
+
from tqdm import tqdm
|
| 255 |
+
|
| 256 |
+
tokenizer = Tokenizer.from_file("tokenizer/tokenizer.json")
|
| 257 |
+
files = sorted(glob.glob("data/korean_extra/hplt_ko/en-ko/*.parquet"))
|
| 258 |
+
all_tokens = []
|
| 259 |
+
doc_count = 0
|
| 260 |
+
|
| 261 |
+
for f in tqdm(files, desc="HPLT"):
|
| 262 |
+
table = pq.read_table(f, columns=['tgt_doc'])
|
| 263 |
+
for row in table.column('tgt_doc'):
|
| 264 |
+
d = row.as_py()
|
| 265 |
+
if d and d.get('sentences'):
|
| 266 |
+
text = '\n'.join(s for s in d['sentences'] if s)
|
| 267 |
+
if len(text) > 50:
|
| 268 |
+
all_tokens.extend(tokenizer.encode(text).ids)
|
| 269 |
+
doc_count += 1
|
| 270 |
+
|
| 271 |
+
print(f"HPLT Korean: {doc_count:,} docs, {len(all_tokens):,} tokens")
|
| 272 |
+
n_val = int(len(all_tokens) * 0.002)
|
| 273 |
+
np.array(all_tokens[n_val:], dtype=np.uint16).tofile("data/hplt_ko_train.bin")
|
| 274 |
+
np.array(all_tokens[:n_val], dtype=np.uint16).tofile("data/hplt_ko_val.bin")
|
| 275 |
+
PYEOF
|
| 276 |
+
log "[DONE] HPLT"
|
| 277 |
+
fi
|
| 278 |
+
fi
|
| 279 |
+
|
| 280 |
+
# ─── Step 6: textbooks + finepdfs + kovast (note: kovast is not tokenized below) ──
|
| 281 |
+
if [[ $FROM_STEP -le 6 ]]; then
|
| 282 |
+
log "=== Step 6: 기타 소스 토큰화 ==="
|
| 283 |
+
EXTRA_OUT="${DATA_DIR}/extra_misc_train.bin"
|
| 284 |
+
|
| 285 |
+
if [[ -f "$EXTRA_OUT" && $FROM_STEP -le 0 ]]; then
|
| 286 |
+
log "[SKIP] extra_misc already tokenized"
|
| 287 |
+
else
|
| 288 |
+
python3 - <<'PYEOF'
|
| 289 |
+
import glob, numpy as np, os
|
| 290 |
+
from tokenizers import Tokenizer
|
| 291 |
+
import pyarrow.parquet as pq
|
| 292 |
+
from tqdm import tqdm
|
| 293 |
+
|
| 294 |
+
tokenizer = Tokenizer.from_file("tokenizer/tokenizer.json")
|
| 295 |
+
all_tokens = []
|
| 296 |
+
doc_count = 0
|
| 297 |
+
|
| 298 |
+
# korean_textbooks (MMLU-style: look for text columns)
|
| 299 |
+
tb_files = glob.glob("data/korean_extra/korean_textbooks/**/*.parquet", recursive=True)
|
| 300 |
+
for f in tqdm(tb_files, desc="textbooks"):
|
| 301 |
+
try:
|
| 302 |
+
table = pq.read_table(f)
|
| 303 |
+
# Try common text columns
|
| 304 |
+
for col in ['question', 'text', 'input', 'instruction']:
|
| 305 |
+
if col in table.column_names:
|
| 306 |
+
for val in table.column(col):
|
| 307 |
+
t = val.as_py()
|
| 308 |
+
if t and len(t) > 20:
|
| 309 |
+
all_tokens.extend(tokenizer.encode(t).ids)
|
| 310 |
+
doc_count += 1
|
| 311 |
+
break
|
| 312 |
+
except Exception:
|
| 313 |
+
continue
|
| 314 |
+
|
| 315 |
+
# finepdfs
|
| 316 |
+
pdf_files = glob.glob("data/korean_extra/finepdfs_edu_ko/*.parquet")
|
| 317 |
+
for f in tqdm(pdf_files, desc="finepdfs"):
|
| 318 |
+
try:
|
| 319 |
+
table = pq.read_table(f)
|
| 320 |
+
for col in ['text', 'content']:
|
| 321 |
+
if col in table.column_names:
|
| 322 |
+
for val in table.column(col):
|
| 323 |
+
t = val.as_py()
|
| 324 |
+
if t and len(t) > 50:
|
| 325 |
+
all_tokens.extend(tokenizer.encode(t).ids)
|
| 326 |
+
doc_count += 1
|
| 327 |
+
break
|
| 328 |
+
except Exception:
|
| 329 |
+
continue
|
| 330 |
+
|
| 331 |
+
print(f"Extra: {doc_count:,} docs, {len(all_tokens):,} tokens")
|
| 332 |
+
n_val = int(len(all_tokens) * 0.002)
|
| 333 |
+
np.array(all_tokens[n_val:], dtype=np.uint16).tofile("data/extra_misc_train.bin")
|
| 334 |
+
np.array(all_tokens[:n_val], dtype=np.uint16).tofile("data/extra_misc_val.bin")
|
| 335 |
+
PYEOF
|
| 336 |
+
log "[DONE] extra_misc"
|
| 337 |
+
fi
|
| 338 |
+
fi
|
| 339 |
+
|
| 340 |
+
# ─── Step 7: Merge everything ───────────────────────────────────────────
|
| 341 |
+
if [[ $FROM_STEP -le 7 ]]; then
|
| 342 |
+
log "=== Step 7: Merge all tokenized sets ==="
|
| 343 |
+
|
| 344 |
+
TRAIN_BINS=""
|
| 345 |
+
for f in \
|
| 346 |
+
"${DATA_DIR}/korean_train.bin" \
|
| 347 |
+
"${DATA_DIR}/culturax_train.bin" \
|
| 348 |
+
"${DATA_DIR}/cc100_train.bin" \
|
| 349 |
+
"${DATA_DIR}/oscar_train.bin" \
|
| 350 |
+
"${DATA_DIR}/webtext_train.bin" \
|
| 351 |
+
"${DATA_DIR}/hplt_ko_train.bin" \
|
| 352 |
+
"${DATA_DIR}/extra_misc_train.bin"; do
|
| 353 |
+
if [[ -f "$f" ]]; then
|
| 354 |
+
TRAIN_BINS="$TRAIN_BINS $f"
|
| 355 |
+
log " Including: $f ($(du -h "$f" | cut -f1))"
|
| 356 |
+
else
|
| 357 |
+
log " [WARN] Missing: $f"
|
| 358 |
+
fi
|
| 359 |
+
done
|
| 360 |
+
|
| 361 |
+
if [[ -n "$TRAIN_BINS" ]]; then
|
| 362 |
+
python3 data/merge_bins.py $TRAIN_BINS "${DATA_DIR}/merged_3b_train.bin"
|
| 363 |
+
log "[DONE] merged_3b_train.bin created"
|
| 364 |
+
fi
|
| 365 |
+
|
| 366 |
+
# Merge val sets
|
| 367 |
+
VAL_BINS=""
|
| 368 |
+
for f in \
|
| 369 |
+
"${DATA_DIR}/korean_val.bin" \
|
| 370 |
+
"${DATA_DIR}/culturax_val.bin" \
|
| 371 |
+
"${DATA_DIR}/cc100_val.bin" \
|
| 372 |
+
"${DATA_DIR}/oscar_val.bin" \
|
| 373 |
+
"${DATA_DIR}/webtext_val.bin" \
|
| 374 |
+
"${DATA_DIR}/hplt_ko_val.bin" \
|
| 375 |
+
"${DATA_DIR}/extra_misc_val.bin"; do
|
| 376 |
+
if [[ -f "$f" ]]; then
|
| 377 |
+
VAL_BINS="$VAL_BINS $f"
|
| 378 |
+
fi
|
| 379 |
+
done
|
| 380 |
+
|
| 381 |
+
if [[ -n "$VAL_BINS" ]]; then
|
| 382 |
+
python3 data/merge_bins.py $VAL_BINS "${DATA_DIR}/merged_3b_val.bin"
|
| 383 |
+
log "[DONE] merged_3b_val.bin created"
|
| 384 |
+
fi
|
| 385 |
+
fi
|
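
`data/merge_bins.py`, called above, is not included in this diff. Because each source file is a headerless uint16 token stream, merging presumably reduces to in-order byte concatenation; a minimal sketch under that assumption (the CLI shape — inputs first, output last — is inferred from the calls above):

```python
#!/usr/bin/env python3
# Hypothetical sketch of data/merge_bins.py: concatenate uint16 token .bin files.
# Usage: python3 merge_bins.py IN1.bin [IN2.bin ...] OUT.bin
import shutil
import sys

def main() -> None:
    *inputs, output = sys.argv[1:]          # all but the last arg are inputs
    with open(output, "wb") as out:
        for path in inputs:
            with open(path, "rb") as f:
                shutil.copyfileobj(f, out)  # raw streams concatenate byte-for-byte
            print(f"merged: {path}")
    print(f"wrote: {output}")

if __name__ == "__main__":
    main()
```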
| 386 |
+
|
| 387 |
+
# ─── Step 8: Verification ───────────────────────────────────────────────
|
| 388 |
+
if [[ $FROM_STEP -le 8 ]]; then
|
| 389 |
+
log "=== Step 8: Final verification ==="
|
| 390 |
+
python3 - <<'PYEOF'
|
| 391 |
+
import os, glob
|
| 392 |
+
import numpy as np
|
| 393 |
+
|
| 394 |
+
print("=== 토큰화 결과 ===")
|
| 395 |
+
total_train = 0
|
| 396 |
+
total_val = 0
|
| 397 |
+
for f in sorted(glob.glob("data/*_train.bin") + glob.glob("data/train.bin")):
|
| 398 |
+
n = os.path.getsize(f) // 2
|
| 399 |
+
total_train += n
|
| 400 |
+
print(f" {os.path.basename(f):30s}: {n:>15,} tokens ({os.path.getsize(f)/1e9:.2f} GB)")
|
| 401 |
+
|
| 402 |
+
for f in sorted(glob.glob("data/*_val.bin") + glob.glob("data/val.bin")):
|
| 403 |
+
n = os.path.getsize(f) // 2
|
| 404 |
+
total_val += n
|
| 405 |
+
|
| 406 |
+
print(f"\n Total train: {total_train:,} tokens ({total_train/1e9:.1f}B)")
|
| 407 |
+
print(f" Total val: {total_val:,} tokens ({total_val/1e6:.1f}M)")
|
| 408 |
+
print(f"\n 3B Chinchilla minimum: 60B tokens")
|
| 409 |
+
print(f" Epochs needed for 60B: {60e9/total_train:.1f}")
|
| 410 |
+
print(f" Epochs needed for 100B: {100e9/total_train:.1f}")
|
| 411 |
+
PYEOF
|
| 412 |
+
fi
|
| 413 |
+
|
| 414 |
+
log "=== 파이프라인 완료 ==="
|
source/scripts/prepare_sft_combined.sh
ADDED
|
@@ -0,0 +1,264 @@
|
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
# prepare_sft_combined.sh — merge all data for 3B SFT
|
| 3 |
+
# Combines every SFT source into a single train/val pair
|
| 4 |
+
#
|
| 5 |
+
# Update (2026-03-02): new sft_extra sources added
|
| 6 |
+
# - nayohan_Evol-Instruct-Code-80k-v1-ko (code instructions)
|
| 7 |
+
# - FreedomIntelligence_alpaca-gpt4-korean (GPT-4 alpaca in Korean)
|
| 8 |
+
# - FreedomIntelligence_evol-instruct-korean (evol-instruct in Korean)
|
| 9 |
+
# - coastral_korean-writing-style-instruct (Korean writing styles)
|
| 10 |
+
# - maywell_ko_wikidata_QA (Wikidata QA)
|
| 11 |
+
# - OpenAssistant_oasst1_ko (OASST1 Korean, tree reconstruction)
|
| 12 |
+
# - Bllossom_evol-instruct-ko (loaded if the file exists)
|
| 13 |
+
set -euo pipefail
|
| 14 |
+
BASE="$(cd "$(dirname "$0")/.." && pwd)"
|
| 15 |
+
OUT_DIR="$BASE/data/sft_combined"
|
| 16 |
+
mkdir -p "$OUT_DIR"
|
| 17 |
+
|
| 18 |
+
python3 << 'PYEOF'
|
| 19 |
+
import json, random, os, glob, hashlib
|
| 20 |
+
from collections import defaultdict
|
| 21 |
+
|
| 22 |
+
BASE = "/PROJECT/0325120031_A/ghong/taketimes/llm-bang/data"
|
| 23 |
+
OUT_TRAIN = f"{BASE}/sft_combined/train.jsonl"
|
| 24 |
+
OUT_VAL = f"{BASE}/sft_combined/val.jsonl"
|
| 25 |
+
VAL_RATIO = 0.02
|
| 26 |
+
SEED = 42
|
| 27 |
+
|
| 28 |
+
# List of SFT source files (ones convertible to chat format)
|
| 29 |
+
SOURCES = [
|
| 30 |
+
# (path, fmt) fmt: "messages" | "auto" | "oasst"
|
| 31 |
+
(f"{BASE}/sft/train.jsonl", "messages"),
|
| 32 |
+
(f"{BASE}/sft_extra/ultrachat_200k/train_sft.jsonl", "messages"),
|
| 33 |
+
(f"{BASE}/sft_extra/open_korean_instructions/train.jsonl", "messages"),
|
| 34 |
+
(f"{BASE}/sft_extra/korean_instruction_mix/train.jsonl", "messages"),
|
| 35 |
+
(f"{BASE}/sft_extra/openhermes_2.5/train.jsonl", "messages"),
|
| 36 |
+
(f"{BASE}/sft_extra/magpie_reasoning_v2/train.jsonl", "messages"),
|
| 37 |
+
(f"{BASE}/sft_extra/magpie_reasoning_ko/train.jsonl", "messages"),
|
| 38 |
+
(f"{BASE}/sft_extra/reasoning_r1_1.4m/train.jsonl", "messages"),
|
| 39 |
+
(f"{BASE}/sft_extra/lemon-mint_smol-koreantalk.jsonl", "auto"),
|
| 40 |
+
(f"{BASE}/sft_extra/dbdu_ShareGPT-74k-ko.jsonl", "auto"),
|
| 41 |
+
(f"{BASE}/sft_extra/ko_lima/data.jsonl", "auto"),
|
| 42 |
+
(f"{BASE}/sft_extra/koalpaca_v1_1a/data.jsonl", "auto"),
|
| 43 |
+
(f"{BASE}/sft_extra/kullm_v2/data.jsonl", "auto"),
|
| 44 |
+
(f"{BASE}/sft_extra/kuotient_orca-math-word-problems-193k-korean.jsonl", "auto"),
|
| 45 |
+
(f"{BASE}/sft_extra/kyujinpy_KOR-OpenOrca-Platypus-v3/data.jsonl", "auto"),
|
| 46 |
+
(f"{BASE}/sft_extra/nlp-with-deeplearning_Ko.WizardLM_evol_instruct_V2_196k.jsonl", "auto"),
|
| 47 |
+
(f"{BASE}/sft_extra/AI-MO_NuminaMath-CoT/data.jsonl", "auto"),
|
| 48 |
+
(f"{BASE}/sft_extra/zwhe99_DeepMath-103K/data.jsonl", "auto"),
|
| 49 |
+
# ---- New sources (2026-03-02) ----
|
| 50 |
+
(f"{BASE}/sft_extra/nayohan_Evol-Instruct-Code-80k-v1-ko/data.jsonl", "auto"),
|
| 51 |
+
(f"{BASE}/sft_extra/FreedomIntelligence_alpaca-gpt4-korean.jsonl", "auto"),
|
| 52 |
+
(f"{BASE}/sft_extra/FreedomIntelligence_evol-instruct-korean.jsonl", "auto"),
|
| 53 |
+
(f"{BASE}/sft_extra/coastral_korean-writing-style-instruct.jsonl", "auto"),
|
| 54 |
+
(f"{BASE}/sft_extra/maywell_ko_wikidata_QA.jsonl", "auto"),
|
| 55 |
+
(f"{BASE}/sft_extra/OpenAssistant_oasst1_ko.jsonl", "oasst"),
|
| 56 |
+
(f"{BASE}/sft_extra/Bllossom_evol-instruct-ko/data.jsonl", "auto"),
|
| 57 |
+
]
|
| 58 |
+
|
| 59 |
+
def to_messages(obj):
|
| 60 |
+
"""다양한 포맷을 통일된 messages 포맷으로 변환"""
|
| 61 |
+
# 이미 messages 포맷
|
| 62 |
+
if 'messages' in obj and isinstance(obj['messages'], list):
|
| 63 |
+
return obj['messages']
|
| 64 |
+
# conversations 포맷
|
| 65 |
+
if 'conversations' in obj:
|
| 66 |
+
msgs = []
|
| 67 |
+
for turn in obj['conversations']:
|
| 68 |
+
role = turn.get('from', turn.get('role', ''))
|
| 69 |
+
content = turn.get('value', turn.get('content', ''))
|
| 70 |
+
if role in ('human', 'user', 'prompter'):
|
| 71 |
+
msgs.append({'role': 'user', 'content': content})
|
| 72 |
+
elif role in ('gpt', 'assistant', 'bot'):
|
| 73 |
+
msgs.append({'role': 'assistant', 'content': content})
|
| 74 |
+
return msgs if len(msgs) >= 2 else None
|
| 75 |
+
# instruction/output format
|
| 76 |
+
if 'instruction' in obj:
|
| 77 |
+
instruction = obj['instruction']
|
| 78 |
+
inp = obj.get('input', '')
|
| 79 |
+
output = obj.get('output', obj.get('response', ''))
|
| 80 |
+
if not output: return None
|
| 81 |
+
user_content = instruction + ('\n\n' + inp if inp else '')
|
| 82 |
+
return [{'role': 'user', 'content': user_content}, {'role': 'assistant', 'content': output}]
|
| 83 |
+
# question/answer format
|
| 84 |
+
if 'question' in obj and 'answer' in obj:
|
| 85 |
+
return [{'role': 'user', 'content': obj['question']}, {'role': 'assistant', 'content': obj['answer']}]
|
| 86 |
+
# prompt/response
|
| 87 |
+
if 'prompt' in obj and ('response' in obj or 'completion' in obj):
|
| 88 |
+
resp = obj.get('response', obj.get('completion', ''))
|
| 89 |
+
return [{'role': 'user', 'content': obj['prompt']}, {'role': 'assistant', 'content': resp}]
|
| 90 |
+
# problem/solution
|
| 91 |
+
if 'problem' in obj and 'solution' in obj:
|
| 92 |
+
return [{'role': 'user', 'content': obj['problem']}, {'role': 'assistant', 'content': obj['solution']}]
|
| 93 |
+
return None
|
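
To make the mapping concrete, this is what `to_messages` yields for an alpaca-style record (illustrative data):

```python
# Illustrative input/output for to_messages (hypothetical record):
obj = {
    "instruction": "다음 문장을 요약하세요.",
    "input": "인공지능은 현대 사회에서 중요한 기술이다.",
    "output": "인공지능이 중요하다는 내용이다.",
}
# to_messages(obj) →
# [{'role': 'user', 'content': '다음 문장을 요약하세요.\n\n인공지능은 현대 사회에서 중요한 기술이다.'},
#  {'role': 'assistant', 'content': '인공지능이 중요하다는 내용이다.'}]
```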
| 94 |
+
|
| 95 |
+
|
| 96 |
+
def load_oasst(path):
|
| 97 |
+
"""
|
| 98 |
+
Rebuild the flat OpenAssistant OASST1 message format into conversation trees.
|
| 99 |
+
From each root (prompter) message, follow the best-ranked assistant
|
| 100 |
+
reply (rank=0.0) to extract a single conversation thread.
|
| 101 |
+
Messages with deleted=True or review_result=False are excluded.
|
| 102 |
+
"""
|
| 103 |
+
nodes = {} # message_id → obj
|
| 104 |
+
children = defaultdict(list) # parent_id → [child_obj, ...]
|
| 105 |
+
|
| 106 |
+
with open(path, 'r', errors='replace') as f:
|
| 107 |
+
for line in f:
|
| 108 |
+
line = line.strip()
|
| 109 |
+
if not line:
|
| 110 |
+
continue
|
| 111 |
+
try:
|
| 112 |
+
obj = json.loads(line)
|
| 113 |
+
except Exception:
|
| 114 |
+
continue
|
| 115 |
+
if obj.get('deleted', False):
|
| 116 |
+
continue
|
| 117 |
+
if obj.get('review_result') is False:
|
| 118 |
+
continue
|
| 119 |
+
mid = obj.get('message_id')
|
| 120 |
+
if mid:
|
| 121 |
+
nodes[mid] = obj
|
| 122 |
+
pid = obj.get('parent_id')
|
| 123 |
+
if pid:
|
| 124 |
+
children[pid].append(obj)
|
| 125 |
+
|
| 126 |
+
# Sort each child list by ascending rank (rank=null sorts last)
|
| 127 |
+
def sort_key(c):
|
| 128 |
+
r = c.get('rank')
|
| 129 |
+
mid = c.get('message_id', '')
|
| 130 |
+
return (1, 0, mid) if r is None else (0, r, mid)
|
| 131 |
+
for pid in children:
|
| 132 |
+
children[pid].sort(key=sort_key)
|
| 133 |
+
|
| 134 |
+
samples = []
|
| 135 |
+
|
| 136 |
+
def build_thread(node, current_msgs):
|
| 137 |
+
"""재귀적으로 대화 스레드를 따라 samples에 추가."""
|
| 138 |
+
role = node.get('role', '')
|
| 139 |
+
text = node.get('text', '')
|
| 140 |
+
if role == 'prompter':
|
| 141 |
+
mapped_role = 'user'
|
| 142 |
+
elif role == 'assistant':
|
| 143 |
+
mapped_role = 'assistant'
|
| 144 |
+
else:
|
| 145 |
+
return
|
| 146 |
+
|
| 147 |
+
msgs = current_msgs + [{'role': mapped_role, 'content': text}]
|
| 148 |
+
|
| 149 |
+
# Only add a sample once there is a valid user→assistant pair
|
| 150 |
+
if mapped_role == 'assistant' and len(msgs) >= 2:
|
| 151 |
+
samples.append({'messages': msgs})
|
| 152 |
+
|
| 153 |
+
# Follow only the single best-ranked child (rank=0.0) — the highest-quality path
|
| 154 |
+
kids = children.get(node.get('message_id'), [])
|
| 155 |
+
if kids:
|
| 156 |
+
build_thread(kids[0], msgs)
|
| 157 |
+
|
| 158 |
+
# Root nodes: prompter messages with no parent_id
|
| 159 |
+
roots = [n for n in nodes.values() if n.get('parent_id') is None and n.get('role') == 'prompter']
|
| 160 |
+
for root in roots:
|
| 161 |
+
build_thread(root, [])
|
| 162 |
+
|
| 163 |
+
return samples
|
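
To illustrate the child-ordering rule above, here is `sort_key` applied to a hypothetical set of sibling replies:

```python
# sort_key puts ranked children first (ascending rank); unranked (rank=None) go last.
def sort_key(c):
    r = c.get('rank')
    mid = c.get('message_id', '')
    return (1, 0, mid) if r is None else (0, r, mid)

kids = [
    {'message_id': 'c', 'rank': None},   # unranked → sorts last
    {'message_id': 'b', 'rank': 1.0},
    {'message_id': 'a', 'rank': 0.0},    # best-ranked → the path build_thread follows
]
kids.sort(key=sort_key)
print([k['message_id'] for k in kids])   # ['a', 'b', 'c']
```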
| 164 |
+
|
| 165 |
+
|
| 166 |
+
random.seed(SEED)
|
| 167 |
+
all_samples = []
|
| 168 |
+
|
| 169 |
+
for path, fmt in SOURCES:
|
| 170 |
+
if not os.path.exists(path):
|
| 171 |
+
print(f"[SKIP] {path}")
|
| 172 |
+
continue
|
| 173 |
+
|
| 174 |
+
if fmt == "oasst":
|
| 175 |
+
samples = load_oasst(path)
|
| 176 |
+
all_samples.extend(samples)
|
| 177 |
+
print(f"[LOADED] {os.path.basename(path)}: {len(samples):,} samples (oasst tree)")
|
| 178 |
+
continue
|
| 179 |
+
|
| 180 |
+
count = 0
|
| 181 |
+
with open(path, 'r', errors='replace') as f:
|
| 182 |
+
for line in f:
|
| 183 |
+
line = line.strip()
|
| 184 |
+
if not line: continue
|
| 185 |
+
try:
|
| 186 |
+
obj = json.loads(line)
|
| 187 |
+
except Exception:
|
| 188 |
+
continue
|
| 189 |
+
if fmt == "messages":
|
| 190 |
+
msgs = obj.get('messages') or obj.get('conversations')
|
| 191 |
+
if msgs:
|
| 192 |
+
all_samples.append({'messages': msgs})
|
| 193 |
+
count += 1
|
| 194 |
+
else: # auto detect
|
| 195 |
+
msgs = to_messages(obj)
|
| 196 |
+
if msgs and len(msgs) >= 2:
|
| 197 |
+
all_samples.append({'messages': msgs})
|
| 198 |
+
count += 1
|
| 199 |
+
print(f"[LOADED] {os.path.basename(path)}: {count:,} samples")
|
| 200 |
+
if count == 0:
|
| 201 |
+
print(f"[WARN] {os.path.basename(path)}: 0 samples extracted (format detection may have failed)")
|
| 202 |
+
|
| 203 |
+
print(f"\n총 샘플: {len(all_samples):,}")
|
| 204 |
+
|
| 205 |
+
# ---- Deduplication (MD5 of first user message — repeated prompts keep only their first answer) ----
|
| 206 |
+
seen_hashes = set()
|
| 207 |
+
unique_samples = []
|
| 208 |
+
dup_count = 0
|
| 209 |
+
for s in all_samples:
|
| 210 |
+
msgs = s.get('messages', [])
|
| 211 |
+
first_user = next((m['content'] for m in msgs if m.get('role') == 'user'), '')
|
| 212 |
+
h = hashlib.md5(first_user.encode('utf-8', errors='replace')).hexdigest()
|
| 213 |
+
if h in seen_hashes:
|
| 214 |
+
dup_count += 1
|
| 215 |
+
continue
|
| 216 |
+
seen_hashes.add(h)
|
| 217 |
+
unique_samples.append(s)
|
| 218 |
+
|
| 219 |
+
print(f"[DEDUP] 제거: {dup_count:,}, 남은 샘플: {len(unique_samples):,}")
|
| 220 |
+
all_samples = unique_samples
|
| 221 |
+
|
| 222 |
+
# ---- Format validation ----
|
| 223 |
+
def validate_messages(msgs):
|
| 224 |
+
"""Check messages have valid role/content structure."""
|
| 225 |
+
if not isinstance(msgs, list) or len(msgs) < 2:
|
| 226 |
+
return False
|
| 227 |
+
for m in msgs:
|
| 228 |
+
if not isinstance(m, dict):
|
| 229 |
+
return False
|
| 230 |
+
if m.get('role') not in ('user', 'assistant', 'system'):
|
| 231 |
+
return False
|
| 232 |
+
if not isinstance(m.get('content'), str):
|
| 233 |
+
return False
|
| 234 |
+
return True
|
| 235 |
+
|
| 236 |
+
valid_samples = []
|
| 237 |
+
invalid_count = 0
|
| 238 |
+
for s in all_samples:
|
| 239 |
+
if validate_messages(s.get('messages', [])):
|
| 240 |
+
valid_samples.append(s)
|
| 241 |
+
else:
|
| 242 |
+
invalid_count += 1
|
| 243 |
+
|
| 244 |
+
print(f"[VALIDATE] 유효하지 않은 포맷 제거: {invalid_count:,}, 남은 샘플: {len(valid_samples):,}")
|
| 245 |
+
all_samples = valid_samples
|
| 246 |
+
|
| 247 |
+
random.shuffle(all_samples)
|
| 248 |
+
|
| 249 |
+
n_val = int(len(all_samples) * VAL_RATIO)
|
| 250 |
+
val_samples = all_samples[:n_val]
|
| 251 |
+
train_samples = all_samples[n_val:]
|
| 252 |
+
|
| 253 |
+
os.makedirs(os.path.dirname(OUT_TRAIN), exist_ok=True)
|
| 254 |
+
with open(OUT_TRAIN, 'w') as f:
|
| 255 |
+
for s in train_samples:
|
| 256 |
+
f.write(json.dumps(s, ensure_ascii=False) + '\n')
|
| 257 |
+
with open(OUT_VAL, 'w') as f:
|
| 258 |
+
for s in val_samples:
|
| 259 |
+
f.write(json.dumps(s, ensure_ascii=False) + '\n')
|
| 260 |
+
|
| 261 |
+
print(f"[DONE] train: {len(train_samples):,} → {OUT_TRAIN}")
|
| 262 |
+
print(f"[DONE] val: {len(val_samples):,} → {OUT_VAL}")
|
| 263 |
+
PYEOF
|
| 264 |
+
echo "SFT 데이터 병합 완료"
|
source/scripts/quality_gate.sh
ADDED
|
@@ -0,0 +1,518 @@
|
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
# =============================================================================
|
| 3 |
+
# quality_gate.sh — automated quality-gate checks at phase completion
|
| 4 |
+
#
|
| 5 |
+
# Usage:
|
| 6 |
+
# bash scripts/quality_gate.sh <phase>
|
| 7 |
+
#
|
| 8 |
+
# Phases:
|
| 9 |
+
# pretrain — pretraining gate (val_loss, downward loss trend)
|
| 10 |
+
# sft      — SFT gate (val_loss convergence, repetition rate, KoBEST)
|
| 11 |
+
# orpo     — ORPO gate (repetition rate, KoBEST, chosen > rejected)
|
| 12 |
+
# deploy   — deploy gate (GGUF perplexity, Ollama responses)
|
| 13 |
+
# all      — run every gate in sequence
|
| 14 |
+
#
|
| 15 |
+
# Exit codes:
|
| 16 |
+
# 0 — gate passed
|
| 17 |
+
# 1 — gate failed (criteria not met)
|
| 18 |
+
# 2 — required file / dependency missing (cannot run)
|
| 19 |
+
# =============================================================================
|
| 20 |
+
set -uo pipefail
|
| 21 |
+
|
| 22 |
+
PROJECT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
|
| 23 |
+
|
| 24 |
+
# ---------------------------------------------------------------------------
|
| 25 |
+
# Colored output helpers
|
| 26 |
+
# ---------------------------------------------------------------------------
|
| 27 |
+
_RED='\033[0;31m'
|
| 28 |
+
_GREEN='\033[0;32m'
|
| 29 |
+
_YELLOW='\033[1;33m'
|
| 30 |
+
_BLUE='\033[0;34m'
|
| 31 |
+
_NC='\033[0m'
|
| 32 |
+
|
| 33 |
+
log_info() { echo -e "${_BLUE}[INFO]${_NC} $*"; }
|
| 34 |
+
log_ok() { echo -e "${_GREEN}[PASS]${_NC} $*"; }
|
| 35 |
+
log_warn() { echo -e "${_YELLOW}[WARN]${_NC} $*"; }
|
| 36 |
+
log_fail() { echo -e "${_RED}[FAIL]${_NC} $*"; }
|
| 37 |
+
log_skip() { echo -e " [SKIP] $*"; }
|
| 38 |
+
|
| 39 |
+
# ---------------------------------------------------------------------------
|
| 40 |
+
# Utility: evaluate a one-line Python expression (for float comparisons)
|
| 41 |
+
# ---------------------------------------------------------------------------
|
| 42 |
+
py_eval() {
|
| 43 |
+
python3 -c "import sys; sys.exit(0 if ($1) else 1)"
|
| 44 |
+
}
|
| 45 |
+
|
| 46 |
+
py_value() {
|
| 47 |
+
python3 -c "print($1)"
|
| 48 |
+
}
|
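
`py_eval` exists because bash's `[[ ... ]]` cannot compare floats; for example:

```bash
# Float comparisons via Python (values illustrative):
if py_eval "2.31 < 2.5"; then echo "below threshold"; fi
py_value "0.62 * 100"   # prints 62.0
```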
| 49 |
+
|
| 50 |
+
# ---------------------------------------------------------------------------
|
| 51 |
+
# Utility: extract a value from a JSON file
|
| 52 |
+
# ---------------------------------------------------------------------------
|
| 53 |
+
json_get() {
|
| 54 |
+
local file="$1" key="$2"
|
| 55 |
+
python3 -c "
|
| 56 |
+
import json, sys
|
| 57 |
+
try:
|
| 58 |
+
d = json.load(open('$file'))
|
| 59 |
+
keys = '$key'.split('.')
|
| 60 |
+
for k in keys:
|
| 61 |
+
d = d[k]
|
| 62 |
+
print(d)
|
| 63 |
+
except Exception as e:
|
| 64 |
+
print('NOT_FOUND')
|
| 65 |
+
sys.exit(1)
|
| 66 |
+
"
|
| 67 |
+
}
|
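
`json_get` walks dotted keys through nested objects; a quick illustration with a throwaway file:

```bash
# Hypothetical usage of json_get:
echo '{"eval": {"kobest": 0.61}}' > /tmp/metrics.json
json_get /tmp/metrics.json "eval.kobest"    # prints 0.61
json_get /tmp/metrics.json "missing.key"    # prints NOT_FOUND, exit code 1
```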
| 68 |
+
|
| 69 |
+
# ---------------------------------------------------------------------------
|
| 70 |
+
# Gate result tally
|
| 71 |
+
# ---------------------------------------------------------------------------
|
| 72 |
+
GATE_PASS=0
|
| 73 |
+
GATE_FAIL=0
|
| 74 |
+
GATE_SKIP=0
|
| 75 |
+
|
| 76 |
+
record_pass() { GATE_PASS=$((GATE_PASS + 1)); log_ok "$*"; }
|
| 77 |
+
record_fail() { GATE_FAIL=$((GATE_FAIL + 1)); log_fail "$*"; }
|
| 78 |
+
record_skip() { GATE_SKIP=$((GATE_SKIP + 1)); log_skip "$*"; }
|
| 79 |
+
|
| 80 |
+
# =============================================================================
|
| 81 |
+
# Gate 1: Pretrain
|
| 82 |
+
# =============================================================================
|
| 83 |
+
gate_pretrain() {
|
| 84 |
+
echo ""
|
| 85 |
+
echo "=================================================================="
|
| 86 |
+
echo " Gate: PRETRAIN"
|
| 87 |
+
echo " 기준: val_loss < 2.5 | loss 단조 감소 확인"
|
| 88 |
+
echo "=================================================================="
|
| 89 |
+
|
| 90 |
+
# Locate the latest checkpoint directory
|
| 91 |
+
CKPT_BASE="$PROJECT_DIR/checkpoints"
|
| 92 |
+
METRICS_FILE=""
|
| 93 |
+
|
| 94 |
+
# Look for metrics.json or train_log.jsonl
|
| 95 |
+
for candidate in \
|
| 96 |
+
"$CKPT_BASE/korean_3b_fp8_pretrain/metrics.json" \
|
| 97 |
+
"$CKPT_BASE/korean_3b_pretrain/metrics.json" \
|
| 98 |
+
"$PROJECT_DIR/outputs/pretrain_metrics.json" \
|
| 99 |
+
"$PROJECT_DIR/logs/pretrain_metrics.json"
|
| 100 |
+
do
|
| 101 |
+
if [[ -f "$candidate" ]]; then
|
| 102 |
+
METRICS_FILE="$candidate"
|
| 103 |
+
break
|
| 104 |
+
fi
|
| 105 |
+
done
|
| 106 |
+
|
| 107 |
+
if [[ -z "$METRICS_FILE" ]]; then
|
| 108 |
+
log_warn "사전학습 메트릭 파일을 찾을 수 없습니다."
|
| 109 |
+
log_warn "찾는 경로: $CKPT_BASE/korean_3b_*/metrics.json"
|
| 110 |
+
log_warn "메트릭 파일이 없으면 학습 스크립트에서 아래 형식으로 저장하세요:"
|
| 111 |
+
log_warn ' {"val_loss": 2.3, "loss_history": [3.1, 2.8, 2.5, 2.3]}'
|
| 112 |
+
record_skip "메트릭 파일 없음 — 게이트 건너뜀"
|
| 113 |
+
return 0
|
| 114 |
+
fi
|
| 115 |
+
|
| 116 |
+
log_info "메트릭 파일: $METRICS_FILE"
|
| 117 |
+
|
| 118 |
+
# Check val_loss
|
| 119 |
+
VAL_LOSS=$(json_get "$METRICS_FILE" "val_loss" 2>/dev/null || echo "NOT_FOUND")
|
| 120 |
+
if [[ "$VAL_LOSS" == "NOT_FOUND" ]]; then
|
| 121 |
+
record_skip "val_loss 키 없음 — 건너뜀"
|
| 122 |
+
else
|
| 123 |
+
log_info "val_loss = $VAL_LOSS (기준: < 2.5)"
|
| 124 |
+
if py_eval "$VAL_LOSS < 2.5" 2>/dev/null; then
|
| 125 |
+
record_pass "val_loss $VAL_LOSS < 2.5"
|
| 126 |
+
else
|
| 127 |
+
record_fail "val_loss $VAL_LOSS >= 2.5 (기준 미달)"
|
| 128 |
+
fi
|
| 129 |
+
fi
|
| 130 |
+
|
| 131 |
+
# Check the loss trend is downward (loss_history)
|
| 132 |
+
python3 - "$METRICS_FILE" <<'PYEOF'
|
| 133 |
+
import json, sys
|
| 134 |
+
|
| 135 |
+
metrics_file = sys.argv[1]
|
| 136 |
+
try:
|
| 137 |
+
d = json.load(open(metrics_file))
|
| 138 |
+
history = d.get("loss_history", [])
|
| 139 |
+
except Exception as e:
|
| 140 |
+
print(f"[SKIP] loss_history 읽기 실패: {e}")
|
| 141 |
+
sys.exit(0)
|
| 142 |
+
|
| 143 |
+
if len(history) < 2:
|
| 144 |
+
print(f"[SKIP] loss_history 데이터 부�� ({len(history)}개)")
|
| 145 |
+
sys.exit(0)
|
| 146 |
+
|
| 147 |
+
# Check the overall trend is downward (mean of first quarter vs last quarter)
|
| 148 |
+
n = len(history)
|
| 149 |
+
q = max(1, n // 4)
|
| 150 |
+
early_avg = sum(history[:q]) / q
|
| 151 |
+
late_avg = sum(history[-q:]) / q
|
| 152 |
+
|
| 153 |
+
if late_avg < early_avg:
|
| 154 |
+
print(f"[PASS] loss 단조 감소 확인: 초기 avg={early_avg:.4f} → 최근 avg={late_avg:.4f}")
|
| 155 |
+
sys.exit(0)
|
| 156 |
+
else:
|
| 157 |
+
print(f"[FAIL] loss 감소 미확인: 초기 avg={early_avg:.4f}, 최근 avg={late_avg:.4f}")
|
| 158 |
+
sys.exit(1)
|
| 159 |
+
PYEOF
|
| 160 |
+
local mono_exit=$?
|
| 161 |
+
if [[ $mono_exit -eq 0 ]]; then
|
| 162 |
+
GATE_PASS=$((GATE_PASS + 1))
|
| 163 |
+
elif [[ $mono_exit -eq 1 ]]; then
|
| 164 |
+
GATE_FAIL=$((GATE_FAIL + 1))
|
| 165 |
+
else
GATE_SKIP=$((GATE_SKIP + 1))
fi
|
| 166 |
+
# exit code 2 from the Python block above means SKIP
|
| 167 |
+
}
|
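
The gate consumes a metrics.json in the shape shown by the warnings above. A sketch of how a training script could emit it — the path matches the first candidate searched above; the function name and call site are assumptions:

```python
# Hypothetical writer for the metrics.json this gate expects.
import json

def write_pretrain_metrics(path: str, val_loss: float, loss_history: list) -> None:
    # e.g. {"val_loss": 2.3, "loss_history": [3.1, 2.8, 2.5, 2.3]}
    with open(path, "w") as f:
        json.dump({"val_loss": val_loss, "loss_history": loss_history}, f)

write_pretrain_metrics(
    "checkpoints/korean_3b_fp8_pretrain/metrics.json",
    val_loss=2.31,
    loss_history=[3.1, 2.8, 2.5, 2.31],
)
```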
| 168 |
+
|
| 169 |
+
# =============================================================================
|
| 170 |
+
# Gate 2: SFT
|
| 171 |
+
# =============================================================================
|
| 172 |
+
gate_sft() {
|
| 173 |
+
echo ""
|
| 174 |
+
echo "=================================================================="
|
| 175 |
+
echo " Gate: SFT"
|
| 176 |
+
echo " 기준: val_loss 수렴 | 반복률 < 15% | KoBEST > 55%"
|
| 177 |
+
echo "=================================================================="
|
| 178 |
+
|
| 179 |
+
METRICS_FILE=""
|
| 180 |
+
for candidate in \
|
| 181 |
+
"$PROJECT_DIR/outputs/sft_metrics.json" \
|
| 182 |
+
"$PROJECT_DIR/logs/sft_metrics.json" \
|
| 183 |
+
"$PROJECT_DIR/checkpoints/sft/metrics.json"
|
| 184 |
+
do
|
| 185 |
+
if [[ -f "$candidate" ]]; then
|
| 186 |
+
METRICS_FILE="$candidate"
|
| 187 |
+
break
|
| 188 |
+
fi
|
| 189 |
+
done
|
| 190 |
+
|
| 191 |
+
if [[ -z "$METRICS_FILE" ]]; then
|
| 192 |
+
log_warn "SFT 메트릭 파일을 찾을 수 없습니다."
|
| 193 |
+
log_warn ' {"val_loss": 1.8, "rep_rate": 0.08, "kobest_score": 0.62}'
|
| 194 |
+
record_skip "SFT 메트릭 파일 없음 — 게이트 건너뜀"
|
| 195 |
+
return 0
|
| 196 |
+
fi
|
| 197 |
+
|
| 198 |
+
log_info "메트릭 파일: $METRICS_FILE"
|
| 199 |
+
|
| 200 |
+
# val_loss convergence (relative change < 1% over the last two checkpoints)
|
| 201 |
+
python3 - "$METRICS_FILE" <<'PYEOF'
|
| 202 |
+
import json, sys
|
| 203 |
+
|
| 204 |
+
metrics_file = sys.argv[1]
|
| 205 |
+
try:
|
| 206 |
+
d = json.load(open(metrics_file))
|
| 207 |
+
history = d.get("val_loss_history", [])
|
| 208 |
+
except Exception as e:
|
| 209 |
+
print(f"[SKIP] val_loss_history 읽기 실패: {e}")
|
| 210 |
+
sys.exit(0)
|
| 211 |
+
|
| 212 |
+
if len(history) < 2:
|
| 213 |
+
# No history: report the single val_loss if present, then skip the convergence check
|
| 214 |
+
val_loss = d.get("val_loss")
|
| 215 |
+
if val_loss is not None: print(f"[INFO] val_loss = {val_loss} (no convergence history)")
|
| 216 |
+
sys.exit(2)  # skip unconditionally — also avoids IndexError on history[-1] below
|
| 218 |
+
|
| 219 |
+
last = history[-1]
|
| 220 |
+
second = history[-2]
|
| 221 |
+
rel_change = abs(last - second) / max(abs(second), 1e-9)
|
| 222 |
+
|
| 223 |
+
if rel_change < 0.01:
|
| 224 |
+
print(f"[PASS] val_loss 수렴 (상대변화율 {rel_change*100:.3f}% < 1%): {second:.4f} → {last:.4f}")
|
| 225 |
+
sys.exit(0)
|
| 226 |
+
else:
|
| 227 |
+
print(f"[FAIL] val_loss 미수렴 (상대변화율 {rel_change*100:.3f}% >= 1%): {second:.4f} → {last:.4f}")
|
| 228 |
+
sys.exit(1)
|
| 229 |
+
PYEOF
|
| 230 |
+
local conv_exit=$?
|
| 231 |
+
if [[ $conv_exit -eq 0 ]]; then GATE_PASS=$((GATE_PASS + 1)); elif [[ $conv_exit -eq 1 ]]; then GATE_FAIL=$((GATE_FAIL + 1)); else GATE_SKIP=$((GATE_SKIP + 1)); fi
|
| 232 |
+
|
| 233 |
+
# Check repetition rate
|
| 234 |
+
REP_RATE=$(json_get "$METRICS_FILE" "rep_rate" 2>/dev/null || echo "NOT_FOUND")
|
| 235 |
+
if [[ "$REP_RATE" == "NOT_FOUND" ]]; then
|
| 236 |
+
record_skip "rep_rate 키 없음 — 건너뜀"
|
| 237 |
+
else
|
| 238 |
+
REP_PCT=$(py_value "$REP_RATE * 100")
|
| 239 |
+
log_info "반복률 = ${REP_PCT}% (기준: < 15%)"
|
| 240 |
+
if py_eval "$REP_RATE < 0.15" 2>/dev/null; then
|
| 241 |
+
record_pass "반복률 ${REP_PCT}% < 15%"
|
| 242 |
+
else
|
| 243 |
+
record_fail "반복률 ${REP_PCT}% >= 15% (기준 미달)"
|
| 244 |
+
fi
|
| 245 |
+
fi
|
| 246 |
+
|
| 247 |
+
# Check KoBEST
|
| 248 |
+
KOBEST=$(json_get "$METRICS_FILE" "kobest_score" 2>/dev/null || echo "NOT_FOUND")
|
| 249 |
+
if [[ "$KOBEST" == "NOT_FOUND" ]]; then
|
| 250 |
+
record_skip "kobest_score 키 없음 — 건너뜀"
|
| 251 |
+
else
|
| 252 |
+
KOBEST_PCT=$(py_value "$KOBEST * 100")
|
| 253 |
+
log_info "KoBEST = ${KOBEST_PCT}% (기준: > 55%)"
|
| 254 |
+
if py_eval "$KOBEST > 0.55" 2>/dev/null; then
|
| 255 |
+
record_pass "KoBEST ${KOBEST_PCT}% > 55%"
|
| 256 |
+
else
|
| 257 |
+
record_fail "KoBEST ${KOBEST_PCT}% <= 55% (기준 미달)"
|
| 258 |
+
fi
|
| 259 |
+
fi
|
| 260 |
+
}
|
| 261 |
+
|
| 262 |
+
# =============================================================================
|
| 263 |
+
# Gate 3: ORPO
|
| 264 |
+
# =============================================================================
|
| 265 |
+
gate_orpo() {
|
| 266 |
+
echo ""
|
| 267 |
+
echo "=================================================================="
|
| 268 |
+
echo " Gate: ORPO"
|
| 269 |
+
echo " 기준: 반복률 < 5% | KoBEST > 60% | chosen > rejected 90%+"
|
| 270 |
+
echo "=================================================================="
|
| 271 |
+
|
| 272 |
+
METRICS_FILE=""
|
| 273 |
+
for candidate in \
|
| 274 |
+
"$PROJECT_DIR/outputs/orpo_metrics.json" \
|
| 275 |
+
"$PROJECT_DIR/logs/orpo_metrics.json" \
|
| 276 |
+
"$PROJECT_DIR/checkpoints/orpo/metrics.json"
|
| 277 |
+
do
|
| 278 |
+
if [[ -f "$candidate" ]]; then
|
| 279 |
+
METRICS_FILE="$candidate"
|
| 280 |
+
break
|
| 281 |
+
fi
|
| 282 |
+
done
|
| 283 |
+
|
| 284 |
+
if [[ -z "$METRICS_FILE" ]]; then
|
| 285 |
+
log_warn "ORPO 메트릭 파일을 찾을 수 없습니다."
|
| 286 |
+
log_warn ' {"rep_rate": 0.03, "kobest_score": 0.63, "chosen_win_rate": 0.92}'
|
| 287 |
+
record_skip "ORPO 메트릭 파일 없음 — 게이트 건너뜀"
|
| 288 |
+
return 0
|
| 289 |
+
fi
|
| 290 |
+
|
| 291 |
+
log_info "메트릭 파일: $METRICS_FILE"
|
| 292 |
+
|
| 293 |
+
# Repetition rate (stricter: < 5%)
|
| 294 |
+
REP_RATE=$(json_get "$METRICS_FILE" "rep_rate" 2>/dev/null || echo "NOT_FOUND")
|
| 295 |
+
if [[ "$REP_RATE" == "NOT_FOUND" ]]; then
|
| 296 |
+
record_skip "no rep_rate key — skipped"
|
| 297 |
+
else
|
| 298 |
+
REP_PCT=$(py_value "$REP_RATE * 100")
|
| 299 |
+
log_info "Repetition rate = ${REP_PCT}% (threshold: < 5%)"
|
| 300 |
+
if py_eval "$REP_RATE < 0.05" 2>/dev/null; then
|
| 301 |
+
record_pass "repetition rate ${REP_PCT}% < 5%"
|
| 302 |
+
else
|
| 303 |
+
record_fail "repetition rate ${REP_PCT}% >= 5% (criterion not met)"
|
| 304 |
+
fi
|
| 305 |
+
fi
|
| 306 |
+
|
| 307 |
+
# KoBEST (stricter: > 60%)
|
| 308 |
+
KOBEST=$(json_get "$METRICS_FILE" "kobest_score" 2>/dev/null || echo "NOT_FOUND")
|
| 309 |
+
if [[ "$KOBEST" == "NOT_FOUND" ]]; then
|
| 310 |
+
record_skip "no kobest_score key — skipped"
|
| 311 |
+
else
|
| 312 |
+
KOBEST_PCT=$(py_value "$KOBEST * 100")
|
| 313 |
+
log_info "KoBEST = ${KOBEST_PCT}% (threshold: > 60%)"
|
| 314 |
+
if py_eval "$KOBEST > 0.60" 2>/dev/null; then
|
| 315 |
+
record_pass "KoBEST ${KOBEST_PCT}% > 60%"
|
| 316 |
+
else
|
| 317 |
+
record_fail "KoBEST ${KOBEST_PCT}% <= 60% (criterion not met)"
|
| 318 |
+
fi
|
| 319 |
+
fi
|
| 320 |
+
|
| 321 |
+
# Chosen win rate (fraction of pairs where chosen log-prob > rejected log-prob)
|
| 322 |
+
CHOSEN_WIN=$(json_get "$METRICS_FILE" "chosen_win_rate" 2>/dev/null || echo "NOT_FOUND")
|
| 323 |
+
if [[ "$CHOSEN_WIN" == "NOT_FOUND" ]]; then
|
| 324 |
+
record_skip "no chosen_win_rate key — skipped"
|
| 325 |
+
else
|
| 326 |
+
WIN_PCT=$(py_value "$CHOSEN_WIN * 100")
|
| 327 |
+
log_info "Chosen win rate = ${WIN_PCT}% (threshold: >= 90%)"
|
| 328 |
+
if py_eval "$CHOSEN_WIN >= 0.90" 2>/dev/null; then
|
| 329 |
+
record_pass "Chosen win rate ${WIN_PCT}% >= 90%"
|
| 330 |
+
else
|
| 331 |
+
record_fail "Chosen win rate ${WIN_PCT}% < 90% (criterion not met)"
|
| 332 |
+
fi
|
| 333 |
+
fi
|
| 334 |
+
}
|
| 335 |
+
|
| 336 |
+
# =============================================================================
|
| 337 |
+
# Gate 4: Deploy
|
| 338 |
+
# =============================================================================
|
| 339 |
+
gate_deploy() {
|
| 340 |
+
echo ""
|
| 341 |
+
echo "=================================================================="
|
| 342 |
+
echo " Gate: DEPLOY"
|
| 343 |
+
echo " 기준: Q4_K_M perplexity < F16 × 1.05 | Ollama 5개 프롬프트 응답"
|
| 344 |
+
echo "=================================================================="
|
| 345 |
+
|
| 346 |
+
local MODEL_NAME="frankenstallm-3b"
|
| 347 |
+
local GGUF_DIR="$PROJECT_DIR/outputs/gguf"
|
| 348 |
+
local F16_GGUF="$GGUF_DIR/${MODEL_NAME}-f16.gguf"
|
| 349 |
+
local Q4KM_GGUF="$GGUF_DIR/${MODEL_NAME}-Q4_K_M.gguf"
|
| 350 |
+
|
| 351 |
+
# --- Check the GGUF files exist ---
|
| 352 |
+
if [[ ! -f "$Q4KM_GGUF" ]]; then
|
| 353 |
+
log_warn "Q4_K_M GGUF missing: $Q4KM_GGUF"
|
| 354 |
+
log_warn "Run first: bash scripts/convert_3b_gguf.sh"
|
| 355 |
+
record_skip "no GGUF file — perplexity gate skipped"
|
| 356 |
+
else
|
| 357 |
+
# Measure perplexity with llama-perplexity (NOTE: the grep pattern below must match your build's output format; the check is skipped when measurement fails)
|
| 358 |
+
LLAMA_PPL_BIN="$PROJECT_DIR/outputs/llama.cpp/build/bin/llama-perplexity"
|
| 359 |
+
|
| 360 |
+
if [[ ! -f "$LLAMA_PPL_BIN" ]]; then
|
| 361 |
+
log_warn "llama-perplexity 바이너리 없음 — 빌드 시도 중 ..."
|
| 362 |
+
cmake --build "$PROJECT_DIR/outputs/llama.cpp/build" \
|
| 363 |
+
--target llama-perplexity -j "$(nproc)" &>/dev/null || true
|
| 364 |
+
fi
|
| 365 |
+
|
| 366 |
+
# Compare perplexity on a sample text
|
| 367 |
+
SAMPLE_TEXT="$PROJECT_DIR/outputs/gguf/ppl_sample.txt"
|
| 368 |
+
if [[ ! -f "$SAMPLE_TEXT" ]]; then
|
| 369 |
+
# Generate a short Korean sample
|
| 370 |
+
cat > "$SAMPLE_TEXT" <<'SAMPLE'
|
| 371 |
+
인공지능은 현대 사회에서 매우 중요한 기술로 자리잡고 있습니다.
|
| 372 |
+
기계 학습과 딥러닝의 발전으로 인해 다양한 분야에서 혁신이 이루어지고 있습니다.
|
| 373 |
+
자연어 처리 기술의 발전은 인간과 컴퓨터의 상호작용 방식을 근본적으로 변화시키고 있습니다.
|
| 374 |
+
한국어는 교착어로서 특유의 형태론적 특성을 가지고 있어 자연어 처리에 독특한 도전을 제시합니다.
|
| 375 |
+
대규모 언어 모델의 등장으로 기계 번역, 텍스트 요약, 질의응답 등의 성능이 크게 향상되었습니다.
|
| 376 |
+
SAMPLE
|
| 377 |
+
fi
|
| 378 |
+
|
| 379 |
+
if [[ -f "$LLAMA_PPL_BIN" && -f "$F16_GGUF" ]]; then
|
| 380 |
+
log_info "Perplexity 측정 중 (F16 vs Q4_K_M) ..."
|
| 381 |
+
|
| 382 |
+
PPL_F16=$(timeout 120 "$LLAMA_PPL_BIN" -m "$F16_GGUF" -f "$SAMPLE_TEXT" 2>&1 \
|
| 383 |
+
| grep -oP "Perplexity: \K[0-9.]+" | head -1 || echo "0")
|
| 384 |
+
PPL_Q4=$(timeout 120 "$LLAMA_PPL_BIN" -m "$Q4KM_GGUF" -f "$SAMPLE_TEXT" 2>&1 \
|
| 385 |
+
| grep -oP "Perplexity: \K[0-9.]+" | head -1 || echo "0")
|
| 386 |
+
|
| 387 |
+
if [[ "$PPL_F16" == "0" || "$PPL_Q4" == "0" ]]; then
|
| 388 |
+
record_skip "Perplexity 측정 실패 — 건너뜀"
|
| 389 |
+
else
|
| 390 |
+
THRESHOLD=$(py_value "$PPL_F16 * 1.05")
|
| 391 |
+
log_info "F16 PPL = $PPL_F16 | Q4_K_M PPL = $PPL_Q4 | 기준: < $THRESHOLD"
|
| 392 |
+
if py_eval "$PPL_Q4 < $PPL_F16 * 1.05" 2>/dev/null; then
|
| 393 |
+
record_pass "Q4_K_M PPL $PPL_Q4 < F16 PPL × 1.05 ($THRESHOLD)"
|
| 394 |
+
else
|
| 395 |
+
record_fail "Q4_K_M PPL $PPL_Q4 >= F16 PPL × 1.05 ($THRESHOLD)"
|
| 396 |
+
fi
|
| 397 |
+
fi
|
| 398 |
+
else
|
| 399 |
+
record_skip "llama-perplexity 또는 F16 GGUF 없음 — perplexity 게이트 건너뜀"
|
| 400 |
+
fi
|
| 401 |
+
fi
|
| 402 |
+
|
| 403 |
+
# --- Ollama response test ---
|
| 404 |
+
if ! command -v ollama &>/dev/null; then
|
| 405 |
+
record_skip "ollama 없음 — 응답 테스트 건너뜀"
|
| 406 |
+
return 0
|
| 407 |
+
fi
|
| 408 |
+
|
| 409 |
+
if ! ollama list 2>/dev/null | grep -q "$MODEL_NAME"; then
|
| 410 |
+
log_warn "Ollama에 $MODEL_NAME 모델이 등록되지 않았습니다."
|
| 411 |
+
log_warn "먼저 실행: bash scripts/deploy_3b_ollama.sh"
|
| 412 |
+
record_skip "Ollama 모델 미등록 — 응답 테스트 건너뜀"
|
| 413 |
+
return 0
|
| 414 |
+
fi
|
| 415 |
+
|
| 416 |
+
log_info "Ollama 응답 테스트 (5개 프롬프트) ..."
|
| 417 |
+
|
| 418 |
+
declare -a PROMPTS=(
|
| 419 |
+
"안녕하세요."
|
| 420 |
+
"1 더하기 1은 무엇인가요?"
|
| 421 |
+
"파이썬이란 무엇인가요?"
|
| 422 |
+
"한국의 수도는 어디인가요?"
|
| 423 |
+
"오늘 날씨가 좋네요."
|
| 424 |
+
)
|
| 425 |
+
|
| 426 |
+
local PASS=0 FAIL=0
|
| 427 |
+
for i in "${!PROMPTS[@]}"; do
|
| 428 |
+
local PROMPT="${PROMPTS[$i]}"
|
| 429 |
+
local NUM=$((i + 1))
|
| 430 |
+
if RESP=$(timeout 45 ollama run "$MODEL_NAME" "$PROMPT" 2>&1) && [[ -n "$RESP" ]]; then
|
| 431 |
+
log_ok " 프롬프트 $NUM 응답 OK (${#RESP}자)"
|
| 432 |
+
PASS=$((PASS + 1))
|
| 433 |
+
else
|
| 434 |
+
log_fail " 프롬프트 $NUM 응답 실패"
|
| 435 |
+
FAIL=$((FAIL + 1))
|
| 436 |
+
fi
|
| 437 |
+
done
|
| 438 |
+
|
| 439 |
+
log_info "Ollama 응답: $PASS/5 성공"
|
| 440 |
+
if [[ $FAIL -eq 0 ]]; then
|
| 441 |
+
record_pass "Ollama 5개 프롬프트 모두 응답 성공"
|
| 442 |
+
else
|
| 443 |
+
record_fail "Ollama 응답 실패 $FAIL/5"
|
| 444 |
+
fi
|
| 445 |
+
}
|
| 446 |
+
|
| 447 |
+
# =============================================================================
|
| 448 |
+
# Final summary output
|
| 449 |
+
# =============================================================================
|
| 450 |
+
print_summary() {
|
| 451 |
+
local phase="$1"
|
| 452 |
+
local TOTAL=$((GATE_PASS + GATE_FAIL + GATE_SKIP))
|
| 453 |
+
echo ""
|
| 454 |
+
echo "=================================================================="
|
| 455 |
+
echo " Quality Gate 결과: $phase"
|
| 456 |
+
echo " PASS: $GATE_PASS | FAIL: $GATE_FAIL | SKIP: $GATE_SKIP | TOTAL: $TOTAL"
|
| 457 |
+
echo "=================================================================="
|
| 458 |
+
|
| 459 |
+
if [[ $GATE_FAIL -eq 0 ]]; then
|
| 460 |
+
echo -e "${_GREEN} [GATE PASSED]${_NC} 모든 검증 기준 통과"
|
| 461 |
+
echo ""
|
| 462 |
+
return 0
|
| 463 |
+
else
|
| 464 |
+
echo -e "${_RED} [GATE FAILED]${_NC} ${GATE_FAIL}개 검증 기준 미달"
|
| 465 |
+
echo " 실패 항목을 수정한 후 다시 실행하세요."
|
| 466 |
+
echo ""
|
| 467 |
+
return 1
|
| 468 |
+
fi
|
| 469 |
+
}
|
| 470 |
+
|
| 471 |
+
# =============================================================================
|
| 472 |
+
# Entry point
|
| 473 |
+
# =============================================================================
|
| 474 |
+
PHASE="${1:-}"
|
| 475 |
+
|
| 476 |
+
if [[ -z "$PHASE" ]]; then
|
| 477 |
+
echo "Usage: bash scripts/quality_gate.sh <phase>"
|
| 478 |
+
echo " phase: pretrain | sft | orpo | deploy | all"
|
| 479 |
+
exit 2
|
| 480 |
+
fi
|
| 481 |
+
|
| 482 |
+
echo ""
|
| 483 |
+
echo "=================================================================="
|
| 484 |
+
echo " Quality Gate 검증 시작: $PHASE"
|
| 485 |
+
echo " 프로젝트: $PROJECT_DIR"
|
| 486 |
+
echo " 시각 : $(date '+%Y-%m-%d %H:%M:%S')"
|
| 487 |
+
echo "=================================================================="
|
| 488 |
+
|
| 489 |
+
case "$PHASE" in
|
| 490 |
+
pretrain)
|
| 491 |
+
gate_pretrain
|
| 492 |
+
print_summary "pretrain"
|
| 493 |
+
;;
|
| 494 |
+
sft)
|
| 495 |
+
gate_sft
|
| 496 |
+
print_summary "sft"
|
| 497 |
+
;;
|
| 498 |
+
orpo)
|
| 499 |
+
gate_orpo
|
| 500 |
+
print_summary "orpo"
|
| 501 |
+
;;
|
| 502 |
+
deploy)
|
| 503 |
+
gate_deploy
|
| 504 |
+
print_summary "deploy"
|
| 505 |
+
;;
|
| 506 |
+
all)
|
| 507 |
+
gate_pretrain
|
| 508 |
+
gate_sft
|
| 509 |
+
gate_orpo
|
| 510 |
+
gate_deploy
|
| 511 |
+
print_summary "all"
|
| 512 |
+
;;
|
| 513 |
+
*)
|
| 514 |
+
echo "ERROR: 알 수 없는 phase: $PHASE"
|
| 515 |
+
echo "Usage: bash scripts/quality_gate.sh <pretrain|sft|orpo|deploy|all>"
|
| 516 |
+
exit 2
|
| 517 |
+
;;
|
| 518 |
+
esac
|
source/scripts/run_eval.sh
ADDED
|
@@ -0,0 +1,23 @@
|
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
# Usage: bash scripts/run_eval.sh <checkpoint_dir>
|
| 3 |
+
# Example: bash scripts/run_eval.sh checkpoints/korean_1b_fp8_run1/checkpoint-0200000
|
| 4 |
+
set -euo pipefail
|
| 5 |
+
|
| 6 |
+
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
| 7 |
+
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
|
| 8 |
+
|
| 9 |
+
CHECKPOINT="${1:?Usage: bash scripts/run_eval.sh <checkpoint_dir>}"
|
| 10 |
+
|
| 11 |
+
echo "=== Perplexity Evaluation ==="
|
| 12 |
+
python "$PROJECT_DIR/eval/perplexity.py" \
|
| 13 |
+
--checkpoint "$CHECKPOINT" \
|
| 14 |
+
--data "$PROJECT_DIR/data/korean_val.bin" \
|
| 15 |
+
--device cuda:0
|
| 16 |
+
|
| 17 |
+
echo ""
|
| 18 |
+
echo "=== Text Generation ==="
|
| 19 |
+
python "$PROJECT_DIR/eval/generate.py" \
|
| 20 |
+
--checkpoint "$CHECKPOINT" \
|
| 21 |
+
--prompt "안녕하세요, 저는" \
|
| 22 |
+
--max_new_tokens 200 \
|
| 23 |
+
--device cuda:0
|
source/scripts/run_eval_full.sh
ADDED
|
@@ -0,0 +1,236 @@
|
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
# ============================================================
|
| 3 |
+
# run_eval_full.sh — full Korean benchmark evaluation (target: 1.5-3 hours)
|
| 4 |
+
#
|
| 5 |
+
# Usage:
|
| 6 |
+
# bash scripts/run_eval_full.sh [CHECKPOINT_DIR] [OUTPUT_DIR]
|
| 7 |
+
#
|
| 8 |
+
# Example:
|
| 9 |
+
# bash scripts/run_eval_full.sh \
|
| 10 |
+
# checkpoints/korean_1b_sft/checkpoint-0005000 \
|
| 11 |
+
# eval/outputs/full_5000
|
| 12 |
+
#
|
| 13 |
+
# Tasks:
|
| 14 |
+
# - KoBEST (5): boolq, copa, hellaswag, sentineg, wic
|
| 15 |
+
# - HAE-RAE Bench (5): general_knowledge, history, loan_word, rare_word, standard_nomenclature
|
| 16 |
+
# - Global MMLU Korean: 57 domains
|
| 17 |
+
# - PAWS-Ko: paraphrase detection
|
| 18 |
+
# - KorMedMCQA: Korean medical MCQ (optional)
|
| 19 |
+
#
|
| 20 |
+
# Expected total samples: ~15,000
|
| 21 |
+
# 1B model @ 8×B200: roughly 1.5-3 hours
|
| 22 |
+
# ============================================================
|
| 23 |
+
set -euo pipefail
|
| 24 |
+
|
| 25 |
+
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
| 26 |
+
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
|
| 27 |
+
|
| 28 |
+
# ─── Argument handling ───────────────────────────────────
|
| 29 |
+
CHECKPOINT="${1:-checkpoints/korean_1b_sft/checkpoint-0005000}"
|
| 30 |
+
TIMESTAMP="$(date +%Y%m%d_%H%M%S)"
|
| 31 |
+
OUTPUT_DIR="${2:-eval/outputs/full_${TIMESTAMP}}"
|
| 32 |
+
|
| 33 |
+
[[ "$CHECKPOINT" != /* ]] && CHECKPOINT="$PROJECT_DIR/$CHECKPOINT"
|
| 34 |
+
[[ "$OUTPUT_DIR" != /* ]] && OUTPUT_DIR="$PROJECT_DIR/$OUTPUT_DIR"
|
| 35 |
+
|
| 36 |
+
# ─── Configuration ───────────────────────────────────────
|
| 37 |
+
HF_MODEL_DIR="$PROJECT_DIR/outputs/hf_$(basename "$CHECKPOINT")"
|
| 38 |
+
TOKENIZER="$PROJECT_DIR/tokenizer/korean_sp/tokenizer.json"
|
| 39 |
+
|
| 40 |
+
# GPU setup: single GPU or tensor parallel
|
| 41 |
+
# lm-eval's hf backend uses a single GPU by default
|
| 42 |
+
# Multi-GPU: --model_args "pretrained=...,parallelize=True" (automatic device_map)
|
| 43 |
+
USE_MULTI_GPU="${USE_MULTI_GPU:-0}"
|
| 44 |
+
if [ "$USE_MULTI_GPU" = "1" ]; then
|
| 45 |
+
MODEL_EXTRA_ARGS=",parallelize=True"
|
| 46 |
+
echo "▶ 멀티 GPU 모드 활성화 (device_map=auto)"
|
| 47 |
+
else
|
| 48 |
+
MODEL_EXTRA_ARGS=""
|
| 49 |
+
CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0}"
|
| 50 |
+
fi
|
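
Putting the two modes together (env vars as defined above; paths illustrative):

```bash
# Single-GPU (default): pin the evaluation to GPU 2
CUDA_VISIBLE_DEVICES=2 bash scripts/run_eval_full.sh checkpoints/korean_1b_sft/checkpoint-0005000

# Multi-GPU: shard the model across all visible GPUs via device_map=auto
USE_MULTI_GPU=1 bash scripts/run_eval_full.sh checkpoints/korean_1b_sft/checkpoint-0005000
```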
| 51 |
+
|
| 52 |
+
BATCH_SIZE="${BATCH_SIZE:-auto}"
|
| 53 |
+
NUM_FEWSHOT="${NUM_FEWSHOT:-0}"
|
| 54 |
+
|
| 55 |
+
# ─── Task definitions ────────────────────────────────────
|
| 56 |
+
# Core Korean tasks (always run)
|
| 57 |
+
TASKS_CORE="kobest,haerae,paws_ko"
|
| 58 |
+
|
| 59 |
+
# Extended tasks (run when time allows)
|
| 60 |
+
TASKS_EXTENDED="global_mmlu_ko"
|
| 61 |
+
|
| 62 |
+
# 선택적 태스크
|
| 63 |
+
TASKS_OPTIONAL="kormedmcqa" # 한국어 의학 MCQ
|
| 64 |
+
|
| 65 |
+
# 전체 실행 태스크
|
| 66 |
+
TASKS="${TASKS_CORE},${TASKS_EXTENDED}"
|
| 67 |
+
|
| 68 |
+
# ─── 의존성 확인 ─────────────────────────────────────────
|
| 69 |
+
check_dep() {
|
| 70 |
+
python3 -c "import $1" 2>/dev/null || { echo "❌ $1 not found. pip install $2"; exit 1; }
|
| 71 |
+
}
|
| 72 |
+
check_dep lm_eval lm-eval
|
| 73 |
+
check_dep transformers transformers
|
| 74 |
+
check_dep safetensors safetensors
|
| 75 |
+
|
| 76 |
+
echo "=================================================="
|
| 77 |
+
echo " Ko-LLM Full Benchmark Evaluation"
|
| 78 |
+
echo "=================================================="
|
| 79 |
+
echo " Checkpoint : $CHECKPOINT"
|
| 80 |
+
echo " HF output : $HF_MODEL_DIR"
|
| 81 |
+
echo " Tasks : $TASKS"
|
| 82 |
+
echo " Few-shot : $NUM_FEWSHOT"
|
| 83 |
+
echo " Batch size : $BATCH_SIZE"
|
| 84 |
+
echo " Output : $OUTPUT_DIR"
|
| 85 |
+
echo " Multi-GPU : $USE_MULTI_GPU"
|
| 86 |
+
echo " Start time : $(date)"
|
| 87 |
+
echo "=================================================="
|
| 88 |
+
|
| 89 |
+
mkdir -p "$OUTPUT_DIR"
|
| 90 |
+
LOG_FILE="$OUTPUT_DIR/eval_full.log"
|
| 91 |
+
|
| 92 |
+
# ─── Step 1: HF 포맷 변환 ───────────────────────────────
|
| 93 |
+
echo ""
|
| 94 |
+
echo "▶ [1/3] 커스텀 체크포인트 → HF 포맷 변환..."
|
| 95 |
+
|
| 96 |
+
if [ ! -f "$HF_MODEL_DIR/config.json" ]; then
|
| 97 |
+
python3 "$PROJECT_DIR/scripts/convert_to_hf.py" \
|
| 98 |
+
--checkpoint "$CHECKPOINT" \
|
| 99 |
+
--output "$HF_MODEL_DIR" \
|
| 100 |
+
--tokenizer "$TOKENIZER" \
|
| 101 |
+
2>&1 | tee -a "$LOG_FILE"
|
| 102 |
+
echo "✅ HF 변환 완료: $HF_MODEL_DIR"
|
| 103 |
+
else
|
| 104 |
+
echo " ↳ HF 모델 이미 존재, 변환 스킵: $HF_MODEL_DIR"
|
| 105 |
+
fi
|
| 106 |
+
|
| 107 |
+
# ─── Step 2: 전체 평가 ──────────────────────────────────
|
| 108 |
+
echo ""
|
| 109 |
+
echo "▶ [2/3] lm-eval 전체 평가 시작..."
|
| 110 |
+
echo " ↳ 로그: $LOG_FILE"
|
| 111 |
+
START_TIME=$(date +%s)
|
| 112 |
+
|
| 113 |
+
if [ "$USE_MULTI_GPU" = "1" ]; then
|
| 114 |
+
python3 -m lm_eval \
|
| 115 |
+
--model hf \
|
| 116 |
+
--model_args "pretrained=$HF_MODEL_DIR,dtype=float16,parallelize=True" \
|
| 117 |
+
--tasks "$TASKS" \
|
| 118 |
+
--num_fewshot "$NUM_FEWSHOT" \
|
| 119 |
+
--batch_size "$BATCH_SIZE" \
|
| 120 |
+
--output_path "$OUTPUT_DIR" \
|
| 121 |
+
--log_samples \
|
| 122 |
+
--verbosity INFO \
|
| 123 |
+
2>&1 | tee -a "$LOG_FILE"
|
| 124 |
+
else
|
| 125 |
+
CUDA_VISIBLE_DEVICES="$CUDA_VISIBLE_DEVICES" python3 -m lm_eval \
|
| 126 |
+
--model hf \
|
| 127 |
+
--model_args "pretrained=$HF_MODEL_DIR,dtype=float16" \
|
| 128 |
+
--tasks "$TASKS" \
|
| 129 |
+
--num_fewshot "$NUM_FEWSHOT" \
|
| 130 |
+
--batch_size "$BATCH_SIZE" \
|
| 131 |
+
--output_path "$OUTPUT_DIR" \
|
| 132 |
+
--log_samples \
|
| 133 |
+
--verbosity INFO \
|
| 134 |
+
2>&1 | tee -a "$LOG_FILE"
|
| 135 |
+
fi
|
| 136 |
+
|
| 137 |
+
END_TIME=$(date +%s)
|
| 138 |
+
ELAPSED=$(( END_TIME - START_TIME ))
|
| 139 |
+
echo ""
|
| 140 |
+
echo "✅ 평가 완료! 소요: $((ELAPSED/60))분 $((ELAPSED%60))초"
|
| 141 |
+
|
| 142 |
+
# ─── Step 3: 결과 요약 리포트 생성 ─────────────────────
|
| 143 |
+
echo ""
|
| 144 |
+
echo "▶ [3/3] 결과 리포트 생성..."
|
| 145 |
+
|
| 146 |
+
python3 - "$OUTPUT_DIR" "$CHECKPOINT" <<'PYEOF'
|
| 147 |
+
import json, glob, sys, os
|
| 148 |
+
from datetime import datetime
|
| 149 |
+
|
| 150 |
+
output_dir = sys.argv[1]
|
| 151 |
+
checkpoint = sys.argv[2] if len(sys.argv) > 2 else "unknown"
|
| 152 |
+
|
| 153 |
+
results_files = sorted(glob.glob(f"{output_dir}/**/*.json", recursive=True))
|
| 154 |
+
results_files = [f for f in results_files if "samples_" not in os.path.basename(f)]
|
| 155 |
+
|
| 156 |
+
report_lines = [
|
| 157 |
+
f"# Ko-LLM Full Eval Report",
|
| 158 |
+
f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
|
| 159 |
+
f"Checkpoint: {checkpoint}",
|
| 160 |
+
"",
|
| 161 |
+
]
|
| 162 |
+
|
| 163 |
+
all_results = {}
|
| 164 |
+
for rf in results_files:
|
| 165 |
+
try:
|
| 166 |
+
with open(rf) as f:
|
| 167 |
+
data = json.load(f)
|
| 168 |
+
results = data.get("results", {})
|
| 169 |
+
if results:
|
| 170 |
+
all_results.update(results)
|
| 171 |
+
except Exception:
|
| 172 |
+
pass
|
| 173 |
+
|
| 174 |
+
# KoBEST 요약
|
| 175 |
+
kobest_tasks = [k for k in all_results if k.startswith("kobest_")]
|
| 176 |
+
if kobest_tasks:
|
| 177 |
+
report_lines.append("## KoBEST")
|
| 178 |
+
report_lines.append("| Task | Metric | Score |")
|
| 179 |
+
report_lines.append("|------|--------|-------|")
|
| 180 |
+
for task in sorted(kobest_tasks):
|
| 181 |
+
metrics = all_results[task]
|
| 182 |
+
for key, val in metrics.items():
|
| 183 |
+
if "stderr" not in key and isinstance(val, (int, float)):
|
| 184 |
+
report_lines.append(f"| {task} | {key} | {val:.4f} |")
|
| 185 |
+
|
| 186 |
+
# HAE-RAE 요약
|
| 187 |
+
haerae_tasks = [k for k in all_results if k.startswith("haerae")]
|
| 188 |
+
if haerae_tasks:
|
| 189 |
+
report_lines.append("\n## HAE-RAE Bench")
|
| 190 |
+
report_lines.append("| Task | Metric | Score |")
|
| 191 |
+
report_lines.append("|------|--------|-------|")
|
| 192 |
+
for task in sorted(haerae_tasks):
|
| 193 |
+
metrics = all_results[task]
|
| 194 |
+
for key, val in metrics.items():
|
| 195 |
+
if "stderr" not in key and isinstance(val, (int, float)):
|
| 196 |
+
report_lines.append(f"| {task} | {key} | {val:.4f} |")
|
| 197 |
+
|
| 198 |
+
# MMLU Ko 요약 (상위 레벨만)
|
| 199 |
+
mmlu_top = {k: v for k, v in all_results.items()
|
| 200 |
+
if k.startswith("global_mmlu_ko") and "_" not in k.replace("global_mmlu_ko", "")}
|
| 201 |
+
if mmlu_top:
|
| 202 |
+
report_lines.append("\n## Global MMLU (Korean)")
|
| 203 |
+
for task, metrics in mmlu_top.items():
|
| 204 |
+
for key, val in metrics.items():
|
| 205 |
+
if "stderr" not in key and isinstance(val, (int, float)):
|
| 206 |
+
report_lines.append(f"- {task} {key}: {val:.4f}")
|
| 207 |
+
|
| 208 |
+
# 기타
|
| 209 |
+
other_tasks = [k for k in all_results
|
| 210 |
+
if not k.startswith("kobest_")
|
| 211 |
+
and not k.startswith("haerae")
|
| 212 |
+
and not k.startswith("global_mmlu_ko")]
|
| 213 |
+
if other_tasks:
|
| 214 |
+
report_lines.append("\n## 기타 태스크")
|
| 215 |
+
for task in sorted(other_tasks):
|
| 216 |
+
metrics = all_results[task]
|
| 217 |
+
for key, val in metrics.items():
|
| 218 |
+
if "stderr" not in key and isinstance(val, (int, float)):
|
| 219 |
+
report_lines.append(f"- {task} | {key}: {val:.4f}")
|
| 220 |
+
|
| 221 |
+
report_path = os.path.join(output_dir, "SUMMARY.md")
|
| 222 |
+
with open(report_path, "w") as f:
|
| 223 |
+
f.write("\n".join(report_lines))
|
| 224 |
+
|
| 225 |
+
print("\n".join(report_lines))
|
| 226 |
+
print(f"\n📄 리포트 저장: {report_path}")
|
| 227 |
+
PYEOF
|
| 228 |
+
|
| 229 |
+
echo ""
|
| 230 |
+
echo "=================================================="
|
| 231 |
+
echo "✅ 전체 평가 완료!"
|
| 232 |
+
echo " 결과 디렉토리: $OUTPUT_DIR"
|
| 233 |
+
echo " 요약 리포트 : $OUTPUT_DIR/SUMMARY.md"
|
| 234 |
+
echo " 전체 로그 : $LOG_FILE"
|
| 235 |
+
echo " 완료 시각 : $(date)"
|
| 236 |
+
echo "=================================================="
|
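A note on the knobs above: `USE_MULTI_GPU`, `NUM_FEWSHOT`, and `BATCH_SIZE` are all read from the environment with `${VAR:-default}` expansions, so a heavier run needs no edits to the script. A minimal sketch (the checkpoint path is just the script's default):

```bash
# 8-GPU sharded eval with 5-shot prompting and a fixed batch size,
# all via environment overrides that run_eval_full.sh reads.
USE_MULTI_GPU=1 NUM_FEWSHOT=5 BATCH_SIZE=16 \
bash scripts/run_eval_full.sh \
    checkpoints/korean_1b_sft/checkpoint-0005000 \
    eval/outputs/full_5shot_multi
```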
source/scripts/run_eval_quick.sh
ADDED
@@ -0,0 +1,150 @@
#!/usr/bin/env bash
# ============================================================
# run_eval_quick.sh — quick evaluation check (target: 20-30 min)
#
# Usage:
#   bash scripts/run_eval_quick.sh [CHECKPOINT_DIR] [OUTPUT_DIR]
#
# Example:
#   bash scripts/run_eval_quick.sh \
#       checkpoints/korean_1b_sft/checkpoint-0005000 \
#       eval/outputs/quick_5000
#
# Tasks: kobest_boolq, kobest_copa, haerae_general_knowledge,
#        haerae_history, paws_ko
# ============================================================
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"

# ─── Argument handling ───────────────────────────────────
CHECKPOINT="${1:-checkpoints/korean_1b_sft/checkpoint-0005000}"
TIMESTAMP="$(date +%Y%m%d_%H%M%S)"
OUTPUT_DIR="${2:-eval/outputs/quick_${TIMESTAMP}}"

# Relative → absolute paths
[[ "$CHECKPOINT" != /* ]] && CHECKPOINT="$PROJECT_DIR/$CHECKPOINT"
[[ "$OUTPUT_DIR" != /* ]] && OUTPUT_DIR="$PROJECT_DIR/$OUTPUT_DIR"

# ─── Settings ────────────────────────────────────────────
HF_MODEL_DIR="$PROJECT_DIR/outputs/hf_$(basename "$CHECKPOINT")"
TOKENIZER="$PROJECT_DIR/tokenizer/korean_sp/tokenizer.json"
DEVICE="${CUDA_VISIBLE_DEVICES:-0}"  # default: use GPU 0 only
BATCH_SIZE="auto"

# Quick-check tasks (~2,000 samples, ~20 min)
TASKS="kobest_boolq,kobest_copa,haerae_general_knowledge,haerae_history,paws_ko"

# ─── Dependency checks ───────────────────────────────────
check_dep() {
    python3 -c "import $1" 2>/dev/null || { echo "❌ $1 not found. pip install $2"; exit 1; }
}
check_dep lm_eval lm-eval
check_dep transformers transformers
check_dep safetensors safetensors

echo "=================================================="
echo " Ko-LLM Quick Eval"
echo "=================================================="
echo " Checkpoint : $CHECKPOINT"
echo " HF output  : $HF_MODEL_DIR"
echo " Tasks      : $TASKS"
echo " Output     : $OUTPUT_DIR"
echo " Device     : cuda:$DEVICE"
echo "=================================================="

mkdir -p "$OUTPUT_DIR"

# ─── Step 1: convert to HF format ────────────────────────
if [ ! -f "$HF_MODEL_DIR/config.json" ]; then
    echo ""
    echo "▶ Step 1: Converting custom checkpoint → HF format..."
    python3 "$PROJECT_DIR/scripts/convert_to_hf.py" \
        --checkpoint "$CHECKPOINT" \
        --output "$HF_MODEL_DIR" \
        --tokenizer "$TOKENIZER"
    echo "✅ HF conversion done: $HF_MODEL_DIR"
else
    echo "▶ Step 1: HF model already exists, skipping conversion"
    echo "  $HF_MODEL_DIR"
fi

# ─── Step 2: run lm-eval ─────────────────────────────────
echo ""
echo "▶ Step 2: Starting lm-eval..."
START_TIME=$(date +%s)

CUDA_VISIBLE_DEVICES="$DEVICE" python3 -m lm_eval \
    --model hf \
    --model_args "pretrained=$HF_MODEL_DIR,dtype=float16" \
    --tasks "$TASKS" \
    --num_fewshot 0 \
    --batch_size "$BATCH_SIZE" \
    --output_path "$OUTPUT_DIR" \
    --log_samples \
    --verbosity INFO \
    2>&1 | tee "$OUTPUT_DIR/eval.log"

END_TIME=$(date +%s)
ELAPSED=$(( END_TIME - START_TIME ))

echo ""
echo "=================================================="
echo "✅ Evaluation done!"
echo " Elapsed: $((ELAPSED / 60))m $((ELAPSED % 60))s"
echo " Results: $OUTPUT_DIR"
echo "=================================================="

# ─── Step 3: print result summary ────────────────────────
echo ""
echo "▶ Step 3: Result summary"
python3 - "$OUTPUT_DIR" <<'PYEOF'
import json, glob, sys, os

output_dir = sys.argv[1] if len(sys.argv) > 1 else "."
results_files = glob.glob(f"{output_dir}/**/*.json", recursive=True)
results_files = [f for f in results_files if "results" in os.path.basename(f)]
if not results_files:
    # fall back to any JSON directly under the output dir
    results_files = glob.glob(f"{output_dir}/*.json")
if not results_files:
    print("No result JSON files found. Check eval.log.")
    sys.exit(0)

for rf in results_files[:3]:
    try:
        with open(rf) as f:
            data = json.load(f)
        results = data.get("results", {})
        print(f"\n{'='*50}\nTask Results: {os.path.basename(rf)}\n{'='*50}")
        for task, metrics in results.items():
            print(f"\n{task}:")
            for key, val in metrics.items():
                if "stderr" not in key and isinstance(val, (int, float)):
                    print(f"  {key}: {val:.4f}")
    except Exception as e:
        print(f"Failed to parse {rf}: {e}")
PYEOF
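Since the checkpoint and output directory are positional arguments, a small regression sweep over several checkpoints is a one-liner; a sketch with illustrative step numbers:

```bash
# Quick-eval three checkpoints back to back on GPU 1; each run gets its own
# output dir (and eval.log), so results stay comparable side by side.
for step in 0003000 0004000 0005000; do
    CUDA_VISIBLE_DEVICES=1 bash scripts/run_eval_quick.sh \
        "checkpoints/korean_1b_sft/checkpoint-${step}" \
        "eval/outputs/quick_${step}"
done
```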
source/scripts/run_pretrain.sh
ADDED
@@ -0,0 +1,26 @@
#!/bin/bash
# Usage: bash scripts/run_pretrain.sh [additional torchrun args]
# Runs 8-GPU DDP pretraining via torchrun.
#
# Any extra arguments are forwarded verbatim to pretrain.py.
# Examples:
#   bash scripts/run_pretrain.sh --max_steps 200000
#   bash scripts/run_pretrain.sh --resume checkpoints/checkpoint-0010000

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"

torchrun \
    --nproc_per_node=8 \
    --master_port=29500 \
    "$PROJECT_DIR/train/pretrain.py" \
    --config "$PROJECT_DIR/configs/small.yaml" \
    --train_data "$PROJECT_DIR/data/train.bin" \
    --val_data "$PROJECT_DIR/data/val.bin" \
    --checkpoint_dir "$PROJECT_DIR/checkpoints" \
    --batch_size 8 \
    --grad_accum 4 \
    --warmup_steps 2000 \
    "$@"
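Because `"$@"` is appended after the baked-in defaults, flags given at the call site land later on pretrain.py's command line; a sketch that assumes pretrain.py uses argparse-style last-wins parsing for repeated flags:

```bash
# Resume from step 10000 and halve the per-GPU batch size; --batch_size 4
# appears after the default --batch_size 8, so it takes precedence only if
# pretrain.py keeps the last occurrence (an assumption about pretrain.py).
bash scripts/run_pretrain.sh \
    --resume checkpoints/checkpoint-0010000 \
    --batch_size 4
```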
source/scripts/start-gateway.sh
ADDED
@@ -0,0 +1,44 @@
#!/usr/bin/env bash
# start-gateway.sh — start the OpenClaw gateway directly (standalone process)
set -euo pipefail

RNTIER_HOME="${RNTIER_HOME:-$HOME}"
export PATH="${RNTIER_HOME}/.npm-global/bin:/usr/bin:/usr/local/bin:/bin:$PATH"
export HOME="${HOME:-/home/ghong}"
export OPENCLAW_STATE_DIR="${RNTIER_HOME}/.openclaw"
export OPENCLAW_CONFIG_PATH="${RNTIER_HOME}/.openclaw/openclaw.json"

LOG_DIR="/tmp/openclaw"
GATEWAY_LOG="${LOG_DIR}/gateway.log"
PID_FILE="/tmp/openclaw-gateway.pid"

mkdir -p "$LOG_DIR"

# Clean up any existing gateway process
pkill -f "openclaw.*gateway" 2>/dev/null || true
sleep 2

# Start the gateway — fully detached via setsid
setsid nohup "${RNTIER_HOME}/.npm-global/bin/openclaw" gateway run \
    --port 18789 \
    --bind loopback \
    >> "$GATEWAY_LOG" 2>&1 < /dev/null &

PID=$!
echo "$PID" > "$PID_FILE"
date +%s > /tmp/openclaw-last-restart

echo "[$(date)] Gateway launched with PID $PID"

# Wait 10 seconds, then check status
sleep 10

if kill -0 "$PID" 2>/dev/null; then
    echo "[$(date)] OK: Gateway PID $PID is alive"
    # ss exits 0 even when no socket matches, so test for output instead
    if ss -tlnH "sport = :18789" 2>/dev/null | grep -q .; then
        echo "[$(date)] OK: Port 18789 is listening"
    else
        echo "[$(date)] WARN: Port 18789 not yet listening"
    fi
else
    echo "[$(date)] FAIL: Gateway PID $PID died"
    echo "--- Last 20 lines of gateway.log ---"
    tail -20 "$GATEWAY_LOG" 2>/dev/null
    exit 1
fi
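For an out-of-band health check later on, the PID file and port the script writes are enough; a minimal sketch reusing those paths:

```bash
# Re-verify the gateway at any time: the PID from the pid file must be
# alive, and port 18789 must be listening on loopback.
PID=$(cat /tmp/openclaw-gateway.pid)
kill -0 "$PID" 2>/dev/null && echo "gateway alive (PID $PID)" || echo "gateway down"
ss -tln | grep -q ':18789 ' && echo "port 18789 listening" || echo "port 18789 not listening"
tail -5 /tmp/openclaw/gateway.log
```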
source/scripts/telegram_notify.py
ADDED
@@ -0,0 +1,168 @@
#!/usr/bin/env python3
"""
Standalone Telegram notification helper for FRANKENSTALLM 3B training.

Usage:
    python3 scripts/telegram_notify.py "Your message here"
    python3 scripts/telegram_notify.py "<b>Bold</b> message" --parse-mode HTML

Function API:
    from scripts.telegram_notify import send_telegram
    send_telegram("message text")
"""

import os
import sys
import json
import urllib.request
import urllib.parse
import urllib.error
import logging

# ─── Configuration ────────────────────────────────────────────────────────────
BOT_TOKEN = os.environ.get("TELEGRAM_BOT_TOKEN", "")
CHAT_ID = os.environ.get("TELEGRAM_CHAT_ID", "")
TIMEOUT = 15  # seconds
MAX_MSG_LEN = 4096  # Telegram limit

logging.basicConfig(
    level=logging.WARNING,
    format="%(asctime)s [telegram_notify] %(levelname)s: %(message)s",
)
log = logging.getLogger("telegram_notify")


def send_telegram(
    message: str,
    parse_mode: str = "HTML",
    token: str = BOT_TOKEN,
    chat_id: str = CHAT_ID,
    disable_web_page_preview: bool = True,
) -> bool:
    """
    Send a Telegram message via Bot API using urllib (curl-free).

    Args:
        message: Text to send (HTML or Markdown depending on parse_mode).
        parse_mode: "HTML" or "Markdown" or "" (plain).
        token: Bot token (defaults to module-level BOT_TOKEN).
        chat_id: Recipient chat/channel ID.
        disable_web_page_preview: Suppress link previews.

    Returns:
        True on success, False on any error.
    """
    if not message:
        log.warning("Empty message — skipping send.")
        return False

    # Truncate if over Telegram limit, with notice
    if len(message) > MAX_MSG_LEN:
        truncated_notice = "\n\n<i>[message truncated]</i>" if parse_mode == "HTML" else "\n\n[message truncated]"
        message = message[: MAX_MSG_LEN - len(truncated_notice)] + truncated_notice

    url = f"https://api.telegram.org/bot{token}/sendMessage"

    payload: dict = {
        "chat_id": chat_id,
        "text": message,
        "disable_web_page_preview": disable_web_page_preview,
    }
    if parse_mode:
        payload["parse_mode"] = parse_mode

    data = urllib.parse.urlencode(payload).encode("utf-8")

    try:
        req = urllib.request.Request(
            url,
            data=data,
            method="POST",
            headers={"Content-Type": "application/x-www-form-urlencoded"},
        )
        with urllib.request.urlopen(req, timeout=TIMEOUT) as resp:
            body = resp.read().decode("utf-8")
            result = json.loads(body)
            if result.get("ok"):
                return True
            else:
                log.error("Telegram API error: %s", result.get("description", result))
                return False

    except urllib.error.HTTPError as e:
        try:
            err_body = e.read().decode("utf-8")
        except Exception:
            err_body = str(e)
        log.error("HTTP %d from Telegram: %s", e.code, err_body)
        return False

    except urllib.error.URLError as e:
        log.error("Network error sending Telegram message: %s", e.reason)
        return False

    except json.JSONDecodeError as e:
        log.error("Failed to parse Telegram response: %s", e)
        return False

    except Exception as e:  # noqa: BLE001
        log.error("Unexpected error in send_telegram: %s", e)
        return False


def send_telegram_safe(message: str, **kwargs) -> bool:
    """
    Wrapper that catches ALL exceptions — guaranteed never to crash the caller.
    Suitable for embedding in training loops where stability is critical.
    """
    try:
        return send_telegram(message, **kwargs)
    except Exception as e:  # noqa: BLE001
        log.error("send_telegram_safe caught unhandled exception: %s", e)
        return False


# ─── CLI entry point ──────────────────────────────────────────────────────────
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
        description="Send a Telegram message from the command line."
    )
    parser.add_argument("message", nargs="?", help="Message text to send")
    parser.add_argument(
        "--parse-mode",
        default="HTML",
        choices=["HTML", "Markdown", "MarkdownV2", ""],
        help="Telegram parse_mode (default: HTML)",
    )
    parser.add_argument(
        "--token", default=BOT_TOKEN, help="Override bot token"
    )
    parser.add_argument(
        "--chat-id", default=CHAT_ID, help="Override chat ID"
    )
    args = parser.parse_args()

    # Allow piped stdin if no positional arg given
    if args.message is None:
        if not sys.stdin.isatty():
            args.message = sys.stdin.read().strip()
        else:
            parser.print_help()
            sys.exit(1)

    ok = send_telegram(
        args.message,
        parse_mode=args.parse_mode,
        token=args.token,
        chat_id=args.chat_id,
    )

    if ok:
        print("Telegram message sent successfully.")
        sys.exit(0)
    else:
        print("ERROR: Failed to send Telegram message.", file=sys.stderr)
        sys.exit(1)
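Because the CLI falls back to stdin when no positional message is given, it composes with ordinary shell pipelines; a sketch assuming TELEGRAM_BOT_TOKEN and TELEGRAM_CHAT_ID are exported (the log path is illustrative):

```bash
# Pipe a multi-line status report to Telegram; --parse-mode "" sends plain
# text, so raw log lines containing < or & are not misread as HTML.
{
    echo "nightly eval finished"
    tail -3 eval/outputs/full_20260309/eval_full.log
} | python3 scripts/telegram_notify.py --parse-mode ""
```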
source/scripts/test_ollama_repetition.py
ADDED
@@ -0,0 +1,148 @@
#!/usr/bin/env python3
"""
test_ollama_repetition.py — repetition-rate validation for the Ollama-deployed model

Calls the Ollama API with the same prompts used in the ORPO eval, then measures
n-gram repetition rate and EOS termination rate.
Targets: 3-gram rep < 3% (allowing natural Korean repetition), EOS rate > 95%

Usage:
    python scripts/test_ollama_repetition.py [--model frankenstallm-3b] [--host localhost:11434]
"""
import argparse
import json
import urllib.request
import urllib.error
import sys
from collections import Counter

# The 15 Korean prompts used in the ORPO eval
TEST_PROMPTS = [
    "대한민국의 수도는 어디인가요?",
    "인공지능이란 무엇인가요?",
    "한국의 전통 음식 중에서 김치에 대해 설명해주세요.",
    "프로그래밍을 배우려면 어떻게 해야 하나요?",
    "지구 온난화의 원인과 대책에 대해 설명해주세요.",
    "한국어의 특징을 3가지 설명해주세요.",
    "좋은 리더의 자질에 대해 논해주세요.",
    "우주 탐사의 의미와 중요성을 설명해주세요.",
    "건강한 생활 습관 5가지를 추천해주세요.",
    "인터넷이 현대 사회에 미친 영향을 분석해주세요.",
    "한국의 교육 제도의 장단점을 설명해주세요.",
    "환경 보호를 위해 개인이 할 수 있는 일을 알려주세요.",
    "4차 산업혁명이 일자리에 미치는 영향을 분석해주세요.",
    "독서의 중요성과 효과적인 독서 방법을 알려주세요.",
    "한국 문화의 세계화에 대해 어떻게 생각하시나요?",
]


def compute_ngram_repetition(text: str, n: int) -> float:
    """Compute the n-gram repetition rate (0.0-1.0)."""
    tokens = text.split()
    if len(tokens) < n:
        return 0.0
    ngrams = [tuple(tokens[i:i+n]) for i in range(len(tokens) - n + 1)]
    if not ngrams:
        return 0.0
    counts = Counter(ngrams)
    repeated = sum(c - 1 for c in counts.values() if c > 1)
    return repeated / len(ngrams)


def call_ollama(prompt: str, model: str, host: str, timeout: int = 120) -> dict:
    """Call the Ollama generate API."""
    url = f"http://{host}/api/generate"
    payload = json.dumps({
        "model": model,
        "prompt": prompt,
        "stream": False,
    }).encode("utf-8")

    req = urllib.request.Request(url, data=payload, headers={"Content-Type": "application/json"})
    try:
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            return json.loads(resp.read().decode("utf-8"))
    except urllib.error.URLError as e:
        return {"error": str(e), "response": ""}
    except Exception as e:
        return {"error": str(e), "response": ""}


def main():
    parser = argparse.ArgumentParser(description="Ollama repetition-rate validation")
    parser.add_argument("--model", default="frankenstallm-3b", help="Ollama model name")
    parser.add_argument("--host", default="localhost:11434", help="Ollama server address")
    args = parser.parse_args()

    print(f"{'='*70}")
    print(f" Ollama repetition check: {args.model}")
    print(f" Server: {args.host}")
    print(f" Prompts: {len(TEST_PROMPTS)}")
    print(f"{'='*70}\n")

    results = []
    for i, prompt in enumerate(TEST_PROMPTS, 1):
        print(f"[{i:2d}/{len(TEST_PROMPTS)}] {prompt[:40]}...")
        resp = call_ollama(prompt, args.model, args.host)

        if "error" in resp and resp["error"]:
            print(f"  ERROR: {resp['error']}")
            results.append({"prompt": prompt, "error": resp["error"]})
            continue

        text = resp.get("response", "")
        eos_done = resp.get("done", False)

        rep1 = compute_ngram_repetition(text, 1)
        rep2 = compute_ngram_repetition(text, 2)
        rep3 = compute_ngram_repetition(text, 3)
        rep4 = compute_ngram_repetition(text, 4)

        results.append({
            "prompt": prompt,
            "response_len": len(text),
            "word_count": len(text.split()),
            "eos_done": eos_done,
            "rep1": rep1, "rep2": rep2, "rep3": rep3, "rep4": rep4,
        })

        preview = text[:100].replace("\n", " ")
        print(f"  Response: {preview}...")
        print(f"  Length: {len(text)} chars, EOS: {eos_done}, "
              f"rep(1/2/3/4): {rep1:.2%}/{rep2:.2%}/{rep3:.2%}/{rep4:.2%}")
        print()

    # --- Summary ---
    valid = [r for r in results if "error" not in r or not r.get("error")]
    if not valid:
        print("ERROR: no valid responses")
        sys.exit(1)

    avg_rep3 = sum(r["rep3"] for r in valid) / len(valid)
    eos_rate = sum(1 for r in valid if r["eos_done"]) / len(valid)
    errors = len(results) - len(valid)

    print(f"{'='*70}")
    print(" Summary")
    print(f"{'='*70}")
    print(f" Valid responses: {len(valid)}/{len(results)} (errors: {errors})")
    print(f" Avg 3-gram repetition: {avg_rep3:.2%} (target: < 3%)")
    print(f" EOS termination rate: {eos_rate:.0%} (target: > 95%)")
    print()

    # Pass/Fail
    # Korean naturally repeats particles and conjunctions, so a word-level
    # 3-gram rep of 1.5-2% is the natural floor; the 3% threshold separates
    # this from degenerate repetition (30%+).
    rep_pass = avg_rep3 < 0.03
    eos_pass = eos_rate > 0.95
    overall = rep_pass and eos_pass

    print(f" 3-gram repetition: {'PASS ✓' if rep_pass else 'FAIL ✗'} ({avg_rep3:.2%})")
    print(f" EOS termination:   {'PASS ✓' if eos_pass else 'FAIL ✗'} ({eos_rate:.0%})")
    print(f" Overall: {'PASS ✓' if overall else 'FAIL ✗'}")
    print(f"{'='*70}")

    sys.exit(0 if overall else 1)


if __name__ == "__main__":
    main()
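The exit code already encodes the verdict (0 only when both gates pass), so the test can guard a promotion step directly; a minimal sketch:

```bash
# Gate a rollout on the repetition check: the then-branch runs only when
# avg 3-gram rep < 3% AND EOS termination > 95% (script exits 0).
if python3 scripts/test_ollama_repetition.py --model frankenstallm-3b; then
    echo "repetition gate PASSED: safe to promote"
else
    echo "repetition gate FAILED: keep the previous model"
fi
```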
source/scripts/training_watchdog.sh
ADDED
@@ -0,0 +1,292 @@
#!/usr/bin/env bash
# =============================================================================
# training_watchdog.sh — FRANKENSTALLM 3B Cron-based Training Watchdog
# Run: every 10 minutes via cron
# Alerts via Telegram only when problems are detected.
# =============================================================================
set -euo pipefail

# ─── Paths ───────────────────────────────────────────────────────────────────
WORKDIR="${WORKDIR:-$(cd "$(dirname "$0")/.." && pwd)}"
CKPT_DIR="$WORKDIR/checkpoints/korean_3b_fp8_run1"
LOG_FILE="$CKPT_DIR/train.log"
PID_FILE="$CKPT_DIR/train.pid"
WATCHDOG_LOG="$CKPT_DIR/watchdog.log"
STATE_FILE="$CKPT_DIR/watchdog.state"  # persists last-good step/time
NOTIFY="python3 $WORKDIR/scripts/telegram_notify.py"

# ─── Thresholds ──────────────────────────────────────────────────────────────
LOSS_SPIKE_THRESHOLD="5.0"      # alert if loss > this value
LOSS_NAN_PATTERN="nan|inf|NaN|Inf"
STALL_SECONDS=900               # 15 min without new log line → stalled
DISK_WARN_PCT=85                # alert if disk usage >= this %
GPU_UTIL_WARN_PCT=20            # alert if avg GPU util drops below this %
MIN_TOKPS=5000                  # alert if tok/s drops below this
TOTAL_STEPS=57000
WAIT_COUNT_FILE="/tmp/frankenstallm-wait-count"  # wait-count file
MAX_WAIT_COUNT=10               # past this count: alert, then remove the cron entry

# ─── Helpers ─────────────────────────────────────────────────────────────────
ts() { date '+%Y-%m-%d %H:%M:%S'; }

log_msg() {
    echo "[$(ts)] $*"
}

send_alert() {
    local level="$1"
    local msg="$2"
    log_msg "ALERT[$level]: $msg"
    $NOTIFY "<b>[FRANKENSTALLM ALERT] $level</b>

$msg

<i>$(ts) | watchdog check</i>" || true
}

# ─── 1. Process alive check ──────────────────────────────────────────────────
check_process() {
    if [[ ! -f "$PID_FILE" ]]; then
        # Waiting mode: no PID file means training has not started yet
        local wait_count=0
        [[ -f "$WAIT_COUNT_FILE" ]] && wait_count=$(cat "$WAIT_COUNT_FILE" 2>/dev/null || echo 0)
        wait_count=$(( wait_count + 1 ))
        echo "$wait_count" > "$WAIT_COUNT_FILE"
        log_msg "Training not started yet (waiting ${wait_count}/${MAX_WAIT_COUNT})."

        if (( wait_count > MAX_WAIT_COUNT )); then
            send_alert "WAIT_TIMEOUT" "Training did not start within <b>${wait_count}</b> checks (~$((wait_count * 10)) minutes).

PID file missing: <code>$PID_FILE</code>

Removing the watchdog cron entry automatically. Re-register it after training starts:
<code>crontab -e</code>"
            # Remove training_watchdog from cron
            crontab -l 2>/dev/null | grep -v "training_watchdog" | crontab -
            rm -f "$WAIT_COUNT_FILE"
            log_msg "Watchdog cron entry removed after ${wait_count} waits."
        fi
        return 1
    fi
    # Training has started → reset the wait counter
    rm -f "$WAIT_COUNT_FILE"

    local pid
    pid=$(cat "$PID_FILE" 2>/dev/null | tr -d '[:space:]')

    if [[ -z "$pid" ]]; then
        send_alert "PROCESS" "PID file is empty: $PID_FILE"
        return 1
    fi

    if ! kill -0 "$pid" 2>/dev/null; then
        # Check if it completed normally (step == TOTAL_STEPS)
        local last_step
        last_step=$(grep -oP 'step\s+\K[0-9]+' "$LOG_FILE" 2>/dev/null | tail -1)
        if [[ "$last_step" == "$TOTAL_STEPS" ]]; then
            log_msg "Training COMPLETED at step $TOTAL_STEPS — process exit is expected."
            send_alert "COMPLETE" "Training completed normally at step <code>$TOTAL_STEPS/$TOTAL_STEPS</code>."
        else
            send_alert "CRASH" "Training process (PID $pid) is NOT running.
Last logged step: <code>${last_step:-unknown}</code>/$TOTAL_STEPS

Check log: <code>tail -50 $LOG_FILE</code>"
        fi
        return 1
    fi

    log_msg "Process PID $pid is alive."
    return 0
}

# ─── 2. Stall detection ──────────────────────────────────────────────────────
check_stall() {
    if [[ ! -f "$LOG_FILE" ]]; then
        send_alert "STALL" "Log file not found: $LOG_FILE"
        return 1
    fi

    local log_mtime now elapsed
    log_mtime=$(stat -c '%Y' "$LOG_FILE" 2>/dev/null || echo 0)
    now=$(date +%s)
    elapsed=$(( now - log_mtime ))

    if (( elapsed >= STALL_SECONDS )); then
        local mins=$(( elapsed / 60 ))
        send_alert "STALL" "No log activity for <b>${mins} minutes</b> (threshold: $(( STALL_SECONDS/60 ))min).
Log last modified: <code>$(date -d "@$log_mtime" '+%Y-%m-%d %H:%M:%S')</code>
Training may be hung or extremely slow."
        return 1
    fi

    log_msg "Log freshness OK: last update ${elapsed}s ago."
    return 0
}

# ─── 3. Loss anomaly check ───────────────────────────────────────────────────
check_loss() {
    if [[ ! -f "$LOG_FILE" ]]; then
        return 0
    fi

    # Get last step line
    local last_line
    last_line=$(grep -E 'step\s+[0-9]+.*loss' "$LOG_FILE" 2>/dev/null | tail -1)

    if [[ -z "$last_line" ]]; then
        log_msg "No step lines found in log yet."
        return 0
    fi

    local loss step
    loss=$(echo "$last_line" | grep -oP 'loss\s+\K[0-9.eE+\-naifNIF]+' || echo "")
    step=$(echo "$last_line" | grep -oP 'step\s+\K[0-9]+' || echo "0")

    if [[ -z "$loss" ]]; then
        log_msg "Could not parse loss from: $last_line"
        return 0
    fi

    # NaN/Inf check
    if echo "$loss" | grep -qiE "$LOSS_NAN_PATTERN"; then
        send_alert "LOSS_NAN" "Loss is <b>$loss</b> at step <code>$step</code>.
Training has diverged — NaN/Inf detected.

Last log line:
<code>${last_line}</code>"
        return 1
    fi

    # Spike check (only after warmup, step > 500)
    if (( step > 500 )); then
        local is_spike
        is_spike=$(echo "$loss >= $LOSS_SPIKE_THRESHOLD" | bc -l 2>/dev/null || echo 0)
        if [[ "$is_spike" == "1" ]]; then
            send_alert "LOSS_SPIKE" "Loss spike detected: <b>$loss</b> at step <code>$step</code> (threshold: $LOSS_SPIKE_THRESHOLD).

Last log line:
<code>${last_line}</code>"
            return 1
        fi
    fi

    log_msg "Loss OK: $loss at step $step."
    return 0
}

# ─── 4. Throughput check ─────────────────────────────────────────────────────
check_throughput() {
    if [[ ! -f "$LOG_FILE" ]]; then
        return 0
    fi

    local last_line
    last_line=$(grep -E 'step\s+[0-9]+.*tok/s' "$LOG_FILE" 2>/dev/null | tail -1)
    [[ -z "$last_line" ]] && return 0

    # tok/s may be formatted with commas: 36,321
    local tokps step
    tokps=$(echo "$last_line" | grep -oP 'tok/s\s+\K[\d,]+' | tr -d ',' || echo "")
    step=$(echo "$last_line" | grep -oP 'step\s+\K[0-9]+' || echo "0")

    if [[ -z "$tokps" ]]; then
        log_msg "Could not parse tok/s from last log line."
        return 0
    fi

    if (( step > 100 && tokps < MIN_TOKPS )); then
        send_alert "THROUGHPUT" "Throughput dropped to <b>${tokps} tok/s</b> at step <code>$step</code> (min: ${MIN_TOKPS}).
GPU may be throttling, NCCL stalled, or a data worker is slow."
        return 1
    fi

    log_msg "Throughput OK: ${tokps} tok/s at step $step."
    return 0
}

# ─── 5. GPU utilization check ────────────────────────────────────────────────
check_gpu() {
    if ! command -v nvidia-smi &>/dev/null; then
        log_msg "nvidia-smi not available — skipping GPU check."
        return 0
    fi

    local avg_util
    avg_util=$(nvidia-smi --query-gpu=utilization.gpu --format=csv,noheader,nounits 2>/dev/null \
        | awk '{sum+=$1; count++} END {if(count>0) printf "%.0f", sum/count; else print 0}')

    if [[ -z "$avg_util" || "$avg_util" == "0" ]]; then
        log_msg "GPU util query returned 0 or empty — possibly all idle."
        # Only alert if process is also running
        local pid
        pid=$(cat "$PID_FILE" 2>/dev/null | tr -d '[:space:]')
        if [[ -n "$pid" ]] && kill -0 "$pid" 2>/dev/null; then
            send_alert "GPU_IDLE" "All 8× B200 GPUs show <b>0% utilization</b> while training process is alive.
Possible NCCL hang or data pipeline stall."
            return 1
        fi
        return 0
    fi

    if (( avg_util < GPU_UTIL_WARN_PCT )); then
        local gpu_details
        gpu_details=$(nvidia-smi --query-gpu=index,utilization.gpu,memory.used,memory.total \
            --format=csv,noheader 2>/dev/null | head -8 || echo "unavailable")
        send_alert "GPU_LOW" "Average GPU utilization: <b>${avg_util}%</b> (threshold: ${GPU_UTIL_WARN_PCT}%).

GPU details:
<code>${gpu_details}</code>"
        return 1
    fi

    log_msg "GPU utilization OK: ${avg_util}% average."
    return 0
}

# ─── 6. Disk space check ─────────────────────────────────────────────────────
check_disk() {
    local usage_pct
    usage_pct=$(df "$CKPT_DIR" 2>/dev/null | awk 'NR==2 {gsub(/%/,"",$5); print $5}')

    if [[ -z "$usage_pct" ]]; then
        log_msg "Could not determine disk usage for $CKPT_DIR."
        return 0
    fi

    if (( usage_pct >= DISK_WARN_PCT )); then
        local avail
        avail=$(df -h "$CKPT_DIR" 2>/dev/null | awk 'NR==2 {print $4}')
        send_alert "DISK" "Disk usage at <b>${usage_pct}%</b> (threshold: ${DISK_WARN_PCT}%).
Available: <b>${avail}</b> on partition containing checkpoints.

Risk: checkpoint saves may fail. Consider deleting old checkpoints."
        return 1
    fi

    log_msg "Disk usage OK: ${usage_pct}% used."
    return 0
}

# ─── Main ────────────────────────────────────────────────────────────────────
main() {
    log_msg "=== Watchdog check START ==="

    local issues=0

    check_process    || (( issues++ )) || true
    check_stall      || (( issues++ )) || true
    check_loss       || (( issues++ )) || true
    check_throughput || (( issues++ )) || true
    check_gpu        || (( issues++ )) || true
    check_disk       || (( issues++ )) || true

    if (( issues == 0 )); then
        log_msg "All checks passed — no alerts sent."
    else
        log_msg "Watchdog found $issues issue(s) — alerts sent."
    fi

    log_msg "=== Watchdog check END ==="
}

main "$@"
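To get the 10-minute cadence the header assumes, the cron entry would look roughly like this; the absolute paths are illustrative, and the redirect feeds the same watchdog.log the script names:

```bash
# Append a */10 cron entry for the watchdog without clobbering existing jobs.
( crontab -l 2>/dev/null
  echo '*/10 * * * * bash /path/to/frankenstallm/scripts/training_watchdog.sh >> /path/to/frankenstallm/checkpoints/korean_3b_fp8_run1/watchdog.log 2>&1'
) | crontab -
```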
source/scripts/upload_to_huggingface.py
ADDED
@@ -0,0 +1,182 @@
#!/usr/bin/env python3
"""Upload FRANKENSTALLM: model, eval reports, source code, and data scripts to Hugging Face.

Usage:
    huggingface-cli login

    # model + README + eval results + reports
    python scripts/upload_to_huggingface.py --repo-id pathcosmos/frankenstallm --create-pr

    # all of the above + source code + data scripts (model/data/source, everything)
    python scripts/upload_to_huggingface.py --repo-id pathcosmos/frankenstallm --with-source --with-data --create-pr

    # eval results and reports only
    python scripts/upload_to_huggingface.py --repo-id pathcosmos/frankenstallm --readme-only --create-pr
"""

import argparse
from pathlib import Path

PROJECT_ROOT = Path(__file__).resolve().parent.parent
HF_CHECKPOINT = PROJECT_ROOT / "outputs" / "hf_checkpoint-best-fixed"
REPORTS_DIR = PROJECT_ROOT / "reports"
EVAL_RESULTS_DIR = PROJECT_ROOT / "eval" / "results" / "frankenstallm-3b-v2"
DATA_DIR = PROJECT_ROOT / "data"


def main():
    parser = argparse.ArgumentParser(description="Upload model, eval reports, source, and data scripts to Hugging Face")
    parser.add_argument("--repo-id", type=str, required=True, help="e.g. pathcosmos/frankenstallm")
    parser.add_argument("--readme-only", action="store_true", help="Only push README + eval results + reports (no model)")
    parser.add_argument("--create-pr", action="store_true", help="Create a Pull Request instead of pushing to main")
    parser.add_argument("--with-source", action="store_true", help="Upload full source code (train, eval, model, configs, scripts, tokenizer)")
    parser.add_argument("--with-data", action="store_true", help="Upload data scripts and DATA_README (no .bin files)")
    args = parser.parse_args()
    create_pr = args.create_pr

    try:
        from huggingface_hub import HfApi, create_repo
    except ImportError:
        print("Install: pip install huggingface_hub")
        raise SystemExit(1)

    api = HfApi()

    # Create the repo if it does not exist (only succeeds for your own account)
    try:
        create_repo(args.repo_id, repo_type="model", exist_ok=True)
    except Exception as e:
        print(f"Note: create_repo skipped (use Hugging Face website to create repo if needed): {e}")

    if not args.readme_only:
        if not HF_CHECKPOINT.exists():
            print(f"Checkpoint not found: {HF_CHECKPOINT}")
            raise SystemExit(1)
        print(f"Uploading model from {HF_CHECKPOINT} ...")
        api.upload_folder(
            folder_path=str(HF_CHECKPOINT),
            repo_id=args.repo_id,
            repo_type="model",
            create_pr=create_pr,
        )
        print("Model upload done.")

    # Use the README from the checkpoint folder (it already includes the eval summary)
    readme_src = HF_CHECKPOINT / "README.md"
    if readme_src.exists():
        print("Pushing README (model card) ...")
        api.upload_file(
            path_or_fileobj=str(readme_src),
            path_in_repo="README.md",
            repo_id=args.repo_id,
            repo_type="model",
            create_pr=create_pr,
        )
        print("README upload done.")
    else:
        print("No README.md in checkpoint dir; skipping README push.")

    # Eval results JSON
    results_json = EVAL_RESULTS_DIR / "ollama_benchmark_results.json"
    if results_json.exists():
        print("Pushing ollama_benchmark_results.json ...")
        api.upload_file(
            path_or_fileobj=str(results_json),
            path_in_repo="eval/ollama_benchmark_results.json",
            repo_id=args.repo_id,
            repo_type="model",
            create_pr=create_pr,
        )
        print("Eval results upload done.")

    # Deployment and evaluation reports (detailed records)
    for name, src in [
        ("2026-03-09_GGUF_DEPLOYMENT_AND_EVAL_REPORT.md", REPORTS_DIR / "2026-03-09_GGUF_DEPLOYMENT_AND_EVAL_REPORT.md"),
        ("2026-03-09_ORPO_EVALUATION_REPORT.md", REPORTS_DIR / "2026-03-09_ORPO_EVALUATION_REPORT.md"),
    ]:
        if src.exists():
            print(f"Pushing {name} ...")
            api.upload_file(
                path_or_fileobj=str(src),
                path_in_repo=f"eval_reports/{name}",
                repo_id=args.repo_id,
                repo_type="model",
                create_pr=create_pr,
            )
    print("Reports upload done.")

    # ---------- Source code (--with-source) ----------
    if args.with_source:
        print("Uploading source code ...")
        ignore_common = ["**/__pycache__/**", "**/*.pyc", "**/.DS_Store"]
        for dirname in ["train", "model", "configs", "scripts", "tokenizer"]:
            src_dir = PROJECT_ROOT / dirname
            if src_dir.exists():
                api.upload_folder(
                    folder_path=str(src_dir),
                    path_in_repo=f"source/{dirname}",
                    repo_id=args.repo_id,
                    repo_type="model",
                    ignore_patterns=ignore_common,
                    create_pr=create_pr,
                )
                print(f"  source/{dirname}/ done.")
        # eval: exclude outputs and results (large)
        eval_dir = PROJECT_ROOT / "eval"
        if eval_dir.exists():
            api.upload_folder(
                folder_path=str(eval_dir),
                path_in_repo="source/eval",
                repo_id=args.repo_id,
                repo_type="model",
                ignore_patterns=ignore_common + ["**/outputs/**", "**/results/**", "**/.compile_cache/**"],
                create_pr=create_pr,
            )
            print("  source/eval/ done.")
        # Root-level documents
        for name in ["README.md", "CLAUDE.md", "requirements.txt", "PROGRESS.md"]:
            src_file = PROJECT_ROOT / name
            if src_file.exists():
                api.upload_file(
                    path_or_fileobj=str(src_file),
                    path_in_repo=f"source/{name}",
                    repo_id=args.repo_id,
                    repo_type="model",
                    create_pr=create_pr,
                )
        for p in PROJECT_ROOT.glob("PLAN_*.md"):
            api.upload_file(
                path_or_fileobj=str(p),
                path_in_repo=f"source/{p.name}",
                repo_id=args.repo_id,
                repo_type="model",
                create_pr=create_pr,
            )
        print("Source upload done.")

    # ---------- Data scripts (--with-data, excluding .bin) ----------
    if args.with_data and DATA_DIR.exists():
        print("Uploading data scripts (no .bin) ...")
        api.upload_folder(
            folder_path=str(DATA_DIR),
            path_in_repo="data",
            repo_id=args.repo_id,
            repo_type="model",
            ignore_patterns=[
                "**/*.bin",
                "**/*.chunk*",
                "**/__pycache__/**",
                "**/code/**",
                "**/*.pyc",
            ],
            create_pr=create_pr,
        )
        print("Data scripts upload done.")

    print(f"Done. https://huggingface.co/{args.repo_id}")


if __name__ == "__main__":
    main()
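After a push it is worth confirming that the expected paths actually landed; `HfApi.list_repo_files` is a standard huggingface_hub call, and the repo id below is the example from the docstring:

```bash
# List repo contents and filter for the artifacts this script uploads.
python3 - <<'PY'
from huggingface_hub import HfApi

files = HfApi().list_repo_files("pathcosmos/frankenstallm", repo_type="model")
for f in sorted(files):
    if f == "README.md" or f.startswith(("source/", "eval", "data/")):
        print(f)
PY
```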