Upload folder using huggingface_hub

#17
Files changed (39)
  1. source/scripts/RESTART_GUIDE.md +23 -0
  2. source/scripts/apply_optimizations.sh +194 -0
  3. source/scripts/build_3b_dataset.sh +83 -0
  4. source/scripts/check_korean_data.sh +178 -0
  5. source/scripts/clickhouse-watchdog.sh +201 -0
  6. source/scripts/convert_3b_gguf.sh +229 -0
  7. source/scripts/convert_to_gguf.sh +92 -0
  8. source/scripts/convert_to_hf.py +262 -0
  9. source/scripts/deploy_3b_ollama.sh +146 -0
  10. source/scripts/deploy_ollama.sh +118 -0
  11. source/scripts/fix_tokenizer_byte_fallback.py +235 -0
  12. source/scripts/hourly_status.sh +241 -0
  13. source/scripts/launch_3b_orpo.sh +177 -0
  14. source/scripts/launch_3b_pretrain.sh +258 -0
  15. source/scripts/launch_3b_sft.sh +145 -0
  16. source/scripts/launch_3b_sft_v2.sh +156 -0
  17. source/scripts/launch_fp8.sh +94 -0
  18. source/scripts/launch_hybrid_3b.sh +62 -0
  19. source/scripts/launch_korean_1b.sh +133 -0
  20. source/scripts/launch_korean_3b.sh +115 -0
  21. source/scripts/launch_sft.sh +111 -0
  22. source/scripts/migrate_qkv_checkpoint.py +230 -0
  23. source/scripts/monitor_3b.sh +316 -0
  24. source/scripts/monitor_training.sh +244 -0
  25. source/scripts/openclaw-watchdog.sh +243 -0
  26. source/scripts/orpo_eval_watchdog.sh +127 -0
  27. source/scripts/orpo_hp_sweep.sh +166 -0
  28. source/scripts/prepare_3b_data.sh +414 -0
  29. source/scripts/prepare_sft_combined.sh +264 -0
  30. source/scripts/quality_gate.sh +518 -0
  31. source/scripts/run_eval.sh +23 -0
  32. source/scripts/run_eval_full.sh +236 -0
  33. source/scripts/run_eval_quick.sh +150 -0
  34. source/scripts/run_pretrain.sh +26 -0
  35. source/scripts/start-gateway.sh +44 -0
  36. source/scripts/telegram_notify.py +168 -0
  37. source/scripts/test_ollama_repetition.py +148 -0
  38. source/scripts/training_watchdog.sh +292 -0
  39. source/scripts/upload_to_huggingface.py +182 -0
source/scripts/RESTART_GUIDE.md ADDED
@@ -0,0 +1,23 @@
1
+ # FRANKENSTALLM 3B — Optimization Restart Guide
2
+
3
+ ## Quick restart (all optimizations applied automatically):
4
+ ```bash
5
+ bash scripts/apply_optimizations.sh
6
+ ```
7
+
8
+ ## Validate only (no restart):
9
+ ```bash
10
+ bash scripts/apply_optimizations.sh --test-only
11
+ ```
12
+
13
+ ## Manual steps if auto-migration fails:
14
+ 1. Stop: `kill $(cat checkpoints/korean_3b_fp8_run1/train.pid)`
15
+ 2. Migrate: `python3 scripts/migrate_qkv_checkpoint.py checkpoints/korean_3b_fp8_run1/checkpoint-XXXXX`
16
+ 3. Restart: `bash scripts/launch_3b_pretrain.sh`
17
+
18
+ ## Rollback (undo QKV fusion):
19
+ ```bash
20
+ CKPT=checkpoints/korean_3b_fp8_run1/checkpoint-XXXXX
21
+ cp ${CKPT}/model.pt.backup_pre_qkv ${CKPT}/model.pt
22
+ git checkout model/attention.py # restore original attention code
23
+ ```
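
The `checkpoint-XXXXX` placeholder in the manual steps above has to be resolved by hand. A minimal sketch of looking it up automatically, mirroring the version-sorted lookup that `apply_optimizations.sh` performs (assumes the `checkpoint-<step>` directory naming used by this run):

```bash
# Resolve the newest checkpoint for the run and migrate it (QKV fusion).
CKPT_DIR=checkpoints/korean_3b_fp8_run1
LATEST_CKPT=$(ls -d "${CKPT_DIR}"/checkpoint-* 2>/dev/null | sort -V | tail -1)
echo "Latest checkpoint: ${LATEST_CKPT}"
python3 scripts/migrate_qkv_checkpoint.py "$LATEST_CKPT"
```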
source/scripts/apply_optimizations.sh ADDED
@@ -0,0 +1,194 @@
1
+ #!/usr/bin/env bash
2
+ # =============================================================================
3
+ # apply_optimizations.sh — Apply v2 optimizations and restart training
4
+ #
5
+ # Optimizations applied:
6
+ # 1. QKV Projection Fusion (+8-12% throughput)
7
+ # 2. NUMA CPU Affinity (fix 69% cross-NUMA workers)
8
+ # 3. Batch size 4→5 (11h saved over full run)
9
+ # 4. NCCL NVLS algorithm + 256MB buffers
10
+ # 5. DDP bucket_cap_mb 400→800
11
+ # 6. DataLoader num_workers 4→6, prefetch_factor 3→4
12
+ # 7. MADV_RANDOM + WILLNEED for PackedDataset
13
+ # 8. numactl --interleave=all on torchrun
14
+ #
15
+ # Usage:
16
+ # bash scripts/apply_optimizations.sh # full migration
17
+ # bash scripts/apply_optimizations.sh --test-only # just validate, don't restart
18
+ # bash scripts/apply_optimizations.sh --skip-stop # don't stop current training
19
+ # =============================================================================
20
+ set -u
21
+
22
+ cd "$(dirname "$0")/.."
23
+
24
+ RUN_NAME="korean_3b_fp8_run1"
25
+ CKPT_DIR="checkpoints/${RUN_NAME}"
26
+ PID_FILE="${CKPT_DIR}/train.pid"
27
+ LOG_FILE="${CKPT_DIR}/train.log"
28
+
29
+ TEST_ONLY=false
30
+ SKIP_STOP=false
31
+ for arg in "$@"; do
32
+ case "$arg" in
33
+ --test-only) TEST_ONLY=true ;;
34
+ --skip-stop) SKIP_STOP=true ;;
35
+ esac
36
+ done
37
+
38
+ echo "=================================================================="
39
+ echo " FRANKENSTALLM 3B — Optimization Migration v2"
40
+ echo " $(date)"
41
+ echo "=================================================================="
42
+
43
+ # ---- Step 1: Validate all modified files --------------------------------
44
+ echo ""
45
+ echo "[1/6] Validating modified files..."
46
+ ERRORS=0
47
+
48
+ for pyfile in model/attention.py train/pretrain.py data/dataset.py scripts/migrate_qkv_checkpoint.py; do
49
+ if python3 -c "import ast; ast.parse(open('$pyfile').read())" 2>/dev/null; then
50
+ echo " ✓ $pyfile — syntax OK"
51
+ else
52
+ echo " ✗ $pyfile — SYNTAX ERROR"
53
+ ((ERRORS++))
54
+ fi
55
+ done
56
+
57
+ if bash -n scripts/launch_3b_pretrain.sh 2>/dev/null; then
58
+ echo " ✓ scripts/launch_3b_pretrain.sh — syntax OK"
59
+ else
60
+ echo " ✗ scripts/launch_3b_pretrain.sh — SYNTAX ERROR"
61
+ ((ERRORS++))
62
+ fi
63
+
64
+ # Check YAML
65
+ python3 -c "
66
+ import yaml
67
+ with open('configs/korean_3b_fp8.yaml') as f:
68
+ cfg = yaml.safe_load(f)
69
+ assert cfg['train']['batch_size'] == 5, f'batch_size should be 5, got {cfg[\"train\"][\"batch_size\"]}'
70
+ print(' ✓ configs/korean_3b_fp8.yaml — valid, batch_size=5')
71
+ " 2>/dev/null || { echo " ✗ configs/korean_3b_fp8.yaml — INVALID"; ((ERRORS++)); }
72
+
73
+ if [[ $ERRORS -gt 0 ]]; then
74
+ echo ""
75
+ echo "[ERROR] $ERRORS file(s) failed validation. Aborting."
76
+ exit 1
77
+ fi
78
+ echo " All files validated successfully."
79
+
80
+ if $TEST_ONLY; then
81
+ echo ""
82
+ echo "[INFO] --test-only mode. Exiting without restart."
83
+ exit 0
84
+ fi
85
+
86
+ # ---- Step 2: Stop current training (graceful) ---------------------------
87
+ if ! $SKIP_STOP; then
88
+ echo ""
89
+ echo "[2/6] Stopping current training (SIGTERM → emergency checkpoint)..."
90
+ if [[ -f "$PID_FILE" ]]; then
91
+ PID=$(cat "$PID_FILE")
92
+ if kill -0 "$PID" 2>/dev/null; then
93
+ echo " Sending SIGTERM to PID $PID..."
94
+ kill "$PID"
95
+ echo " Waiting for graceful shutdown (up to 120s)..."
96
+ for i in $(seq 1 120); do
97
+ if ! kill -0 "$PID" 2>/dev/null; then
98
+ echo " Process stopped after ${i}s"
99
+ break
100
+ fi
101
+ sleep 1
102
+ done
103
+ if kill -0 "$PID" 2>/dev/null; then
104
+ echo " [WARN] Process still running after 120s. Force killing..."
105
+ kill -9 "$PID" 2>/dev/null || true
106
+ sleep 2
107
+ fi
108
+ else
109
+ echo " Process $PID not running."
110
+ fi
111
+ else
112
+ echo " No PID file found."
113
+ fi
114
+
115
+ # Wait for all GPU processes to clear
116
+ echo " Waiting for GPU processes to terminate..."
117
+ for i in $(seq 1 30); do
118
+ if ! pgrep -f "pretrain.py.*korean_3b" >/dev/null 2>&1; then
119
+ echo " All GPU processes cleared."
120
+ break
121
+ fi
122
+ sleep 1
123
+ done
124
+ fi
125
+
126
+ # ---- Step 3: Find and migrate latest checkpoint -------------------------
127
+ echo ""
128
+ echo "[3/6] Migrating latest checkpoint (QKV fusion)..."
129
+ LATEST_CKPT=$(ls -d "${CKPT_DIR}"/checkpoint-* 2>/dev/null | sort -V | tail -1)
130
+ if [[ -z "$LATEST_CKPT" ]]; then
131
+ echo " [ERROR] No checkpoint found!"
132
+ exit 1
133
+ fi
134
+ echo " Latest checkpoint: $LATEST_CKPT"
135
+
136
+ # Backup original model.pt
137
+ cp "${LATEST_CKPT}/model.pt" "${LATEST_CKPT}/model.pt.backup_pre_qkv"
138
+ echo " Backup created: model.pt.backup_pre_qkv"
139
+
140
+ # Run migration
141
+ python3 scripts/migrate_qkv_checkpoint.py "$LATEST_CKPT"
142
+ echo " QKV fusion migration complete."
143
+
144
+ # ---- Step 4: Quick validation test (5 steps) ----------------------------
145
+ echo ""
146
+ echo "[4/6] Running 5-step validation test..."
147
+ # Use single GPU for fast test
148
+ timeout 120 python3 train/pretrain.py \
149
+ --config configs/korean_3b_fp8.yaml \
150
+ --train_data data/3b_train.bin \
151
+ --checkpoint_dir /tmp/frankenstallm_test \
152
+ --max_steps 5 \
153
+ --batch_size 5 \
154
+ --resume "$LATEST_CKPT" \
155
+ 2>&1 | tail -10
156
+
157
+ TEST_EXIT=${PIPESTATUS[0]}   # use PIPESTATUS: plain $? would report tail's status, not the training run's
158
+ if [[ $TEST_EXIT -eq 0 ]]; then
159
+ echo " ✓ 5-step test passed!"
160
+ else
161
+ echo " ✗ 5-step test FAILED (exit code $TEST_EXIT)"
162
+ echo " [WARN] Restoring original checkpoint..."
163
+ cp "${LATEST_CKPT}/model.pt.backup_pre_qkv" "${LATEST_CKPT}/model.pt"
164
+ echo " Original checkpoint restored. Aborting."
165
+ exit 1
166
+ fi
167
+
168
+ # ---- Step 5: Clean up test artifacts ------------------------------------
169
+ echo ""
170
+ echo "[5/6] Cleaning up test artifacts..."
171
+ rm -rf /tmp/frankenstallm_test
172
+
173
+ # ---- Step 6: Launch full training with optimizations --------------------
174
+ echo ""
175
+ echo "[6/6] Launching optimized training..."
176
+ echo ""
177
+ echo " Changes applied:"
178
+ echo " • QKV Projection Fusion (single GEMM)"
179
+ echo " • NUMA CPU Affinity (cores 0-35→GPU0-3, 36-71→GPU4-7)"
180
+ echo " • Batch size: 4 → 5"
181
+ echo " • NCCL: NVLS,Ring algorithm, 256MB buffers"
182
+ echo " • DDP: bucket_cap_mb 400 → 800"
183
+ echo " • DataLoader: 4→6 workers, prefetch 3→4"
184
+ echo " • MADV_RANDOM + WILLNEED for dataset mmap"
185
+ echo " • numactl --interleave=all on torchrun"
186
+ echo ""
187
+
188
+ bash scripts/launch_3b_pretrain.sh
189
+
190
+ echo ""
191
+ echo "=================================================================="
192
+ echo " Migration complete! Monitor with:"
193
+ echo " tail -f ${LOG_FILE}"
194
+ echo "=================================================================="
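
The NCCL, NUMA, and DataLoader settings listed in the step 6 summary are applied by `scripts/launch_3b_pretrain.sh`, which is not shown in this diff. A hedged sketch of what the corresponding launch environment could look like; the exact values are assumptions taken only from the summary text above:

```bash
# Illustrative only: the authoritative settings live in scripts/launch_3b_pretrain.sh.
export NCCL_ALGO=NVLS,Ring                     # "NVLS,Ring algorithm" from the summary
export NCCL_BUFFSIZE=$((256 * 1024 * 1024))    # 256MB NCCL buffers
numactl --interleave=all torchrun --nproc_per_node=8 \
  train/pretrain.py --config configs/korean_3b_fp8.yaml
```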
source/scripts/build_3b_dataset.sh ADDED
@@ -0,0 +1,83 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ cd "$(dirname "$0")/.."
4
+ DATA="data"
5
+
6
+ echo "=================================================================="
7
+ echo " 3B 통합 데이터셋 빌드 | 시작: $(date)"
8
+ echo "=================================================================="
9
+
10
+ # 청크 병합 함수
11
+ merge_chunks() {
12
+ PREFIX="$1"
13
+ OUTPUT="$2"
14
+ CHUNKS=$(ls "${PREFIX}".bin.chunk* 2>/dev/null | sort || true)
15
+ if [[ -z "$CHUNKS" ]]; then return; fi
16
+ if [[ -f "$OUTPUT" ]]; then echo " [SKIP] $OUTPUT 이미 존재"; return; fi
17
+ echo " 청크 병합: $(basename $PREFIX)"
18
+ cat $CHUNKS > "$OUTPUT"
19
+ echo " 완료: $(du -sh $OUTPUT | cut -f1)"
20
+ }
21
+
22
+ merge_chunks "$DATA/cosmo_auto_math_text_train" "$DATA/cosmo_auto_math_text_train.bin"
23
+ merge_chunks "$DATA/cosmo_auto_math_text_val" "$DATA/cosmo_auto_math_text_val.bin"
24
+ merge_chunks "$DATA/cosmo_web_v2_train" "$DATA/cosmo_web_v2_train.bin"
25
+ merge_chunks "$DATA/cosmo_web_v2_val" "$DATA/cosmo_web_v2_val.bin"
26
+
27
+ TRAIN_FILES=""
28
+ for f in \
29
+ "$DATA/korean_train.bin" \
30
+ "$DATA/hplt_ko_train.bin" \
31
+ "$DATA/korean_c4_train.bin" \
32
+ "$DATA/cc100_ko_train.bin" \
33
+ "$DATA/namuwiki_2023b_train.bin" \
34
+ "$DATA/korean_namuwiki_train.bin" \
35
+ "$DATA/wikipedia_ko_train.bin" \
36
+ "$DATA/korean_wiki_train.bin" \
37
+ "$DATA/open_web_math_train.bin" \
38
+ "$DATA/mathpile_train.bin" \
39
+ "$DATA/cosmo_auto_math_text_train.bin" \
40
+ "$DATA/cosmo_stories_train.bin" \
41
+ "$DATA/cosmo_web_v2_train.bin" \
42
+ "$DATA/cosmo_stanford_train.bin" \
43
+ "$DATA/cosmo_wikihow_train.bin" \
44
+ "$DATA/cosmo_openstax_train.bin" \
45
+ "$DATA/cosmo_khanacademy_train.bin"; do
46
+ [[ -f "$f" ]] && TRAIN_FILES="$TRAIN_FILES $f"
47
+ done
48
+
49
+ VAL_FILES=""
50
+ for f in \
51
+ "$DATA/korean_val.bin" \
52
+ "$DATA/hplt_ko_val.bin" \
53
+ "$DATA/korean_c4_val.bin" \
54
+ "$DATA/cc100_ko_val.bin" \
55
+ "$DATA/namuwiki_2023b_val.bin" \
56
+ "$DATA/open_web_math_val.bin" \
57
+ "$DATA/mathpile_val.bin" \
58
+ "$DATA/cosmo_auto_math_text_val.bin" \
59
+ "$DATA/cosmo_stories_val.bin" \
60
+ "$DATA/cosmo_web_v2_val.bin"; do
61
+ [[ -f "$f" ]] && VAL_FILES="$VAL_FILES $f"
62
+ done
63
+
64
+ echo ""
65
+ echo "train 파일 병합 → data/3b_train.bin ..."
66
+ python3 data/merge_bins.py $TRAIN_FILES data/3b_train.bin
67
+
68
+ echo ""
69
+ echo "val 파일 병합 → data/3b_val.bin ..."
70
+ python3 data/merge_bins.py $VAL_FILES data/3b_val.bin
71
+
72
+ echo ""
73
+ echo "=================================================================="
74
+ du -sh data/3b_train.bin data/3b_val.bin
75
+ python3 -c "
76
+ import os
77
+ sz = os.path.getsize('data/3b_train.bin')
78
+ tok = sz // 2
79
+ print(f'3b_train: {tok/1e9:.2f}B tokens')
80
+ print(f'60B 달성 에포크: {60/(tok/1e9):.1f}x 반복 필요')
81
+ "
82
+ echo "완료: $(date)"
83
+ echo "=================================================================="
source/scripts/check_korean_data.sh ADDED
@@ -0,0 +1,178 @@
1
+ #!/bin/bash
2
+
3
+ # 한국어 학습 데이터 현황 확인 스크립트
4
+ # 용도: 한국어 데이터셋 상태, 토크나이저, 원본 데이터 파일 확인
5
+
6
+ set -e
7
+
8
+ # 프로젝트 루트 (이 스크립트 실행 위치 기준)
9
+ PROJECT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
10
+ cd "${PROJECT_ROOT}"
11
+
12
+ echo "=== 한국어 학습 데이터 현황 ==="
13
+ echo ""
14
+
15
+ # ============================================================================
16
+ # 1. 학습용 바이너리 데이터 확인
17
+ # ============================================================================
18
+ echo "[ 학습 바이너리 데이터 ]"
19
+
20
+ check_binary_data() {
21
+ local file=$1
22
+ local name=$2
23
+
24
+ if [ -f "$file" ]; then
25
+ local size=$(du -h "$file" | cut -f1)
26
+
27
+ # Python + numpy memmap으로 토큰 수 계산
28
+ # 바이너리는 uint32 형태로 저장되어 있음 (4 bytes per token)
29
+ local token_count=$(python3 -c "
30
+ import numpy as np
31
+ try:
32
+ data = np.memmap('$file', dtype=np.uint32, mode='r')
33
+ print(len(data))
34
+ except Exception as e:
35
+ print('error')
36
+ " 2>/dev/null || echo "error")
37
+
38
+ if [ "$token_count" != "error" ] && [ ! -z "$token_count" ]; then
39
+ # 토큰 수를 포맷팅 (천 단위 쉼표)
40
+ local formatted_tokens=$(printf "%'d" "$token_count")
41
+
42
+ # 1B 모델 학습 스텝 계산
43
+ # tokens_per_step = batch_size * grad_accum * seq_len * num_gpus
44
+ # = 8 * 4 * 4096 * 8 = 1,048,576 tokens/step
45
+ local tokens_per_step=1048576
46
+ local estimated_steps=$((token_count / tokens_per_step))
47
+
48
+ printf " %-20s : 존재 (%s, %'d 토큰, ~%'d steps)\n" \
49
+ "$name" "$size" "$token_count" "$estimated_steps"
50
+ else
51
+ printf " %-20s : 존재 (%s, 토큰 계산 실패)\n" "$name" "$size"
52
+ fi
53
+ else
54
+ printf " %-20s : 없음\n" "$name"
55
+ fi
56
+ }
57
+
58
+ check_binary_data "data/korean_train.bin" "korean_train.bin"
59
+ check_binary_data "data/korean_val.bin" "korean_val.bin"
60
+ check_binary_data "data/train.bin" "train.bin"
61
+ check_binary_data "data/val.bin" "val.bin"
62
+
63
+ echo ""
64
+
65
+ # ============================================================================
66
+ # 2. 토크나이저 확인
67
+ # ============================================================================
68
+ echo "[ 토크나이저 ]"
69
+
70
+ check_tokenizer() {
71
+ local dir=$1
72
+ local name=$2
73
+
74
+ if [ -d "$dir" ]; then
75
+ local files=$(find "$dir" -type f | wc -l)
76
+ printf " %-20s : 존재 (%d개 파일)\n" "$name" "$files"
77
+ else
78
+ printf " %-20s : 없음\n" "$name"
79
+ fi
80
+ }
81
+
82
+ check_tokenizer "tokenizer/korean_sp" "korean_sp"
83
+ check_tokenizer "tokenizer" "default tokenizer"
84
+
85
+ echo ""
86
+
87
+ # ============================================================================
88
+ # 3. 원본 데이터 디렉토리 확인
89
+ # ============================================================================
90
+ echo "[ 원본 데이터 ]"
91
+
92
+ check_raw_data() {
93
+ local dir=$1
94
+ local name=$2
95
+
96
+ if [ -d "$dir" ]; then
97
+ local file_count=$(find "$dir" -maxdepth 1 -type f | wc -l)
98
+ local total_size=$(du -sh "$dir" 2>/dev/null | cut -f1)
99
+
100
+ if [ $file_count -eq 0 ]; then
101
+ printf " %-20s : 없음 (디렉토리만 존재, 0 파일)\n" "$name"
102
+ else
103
+ printf " %-20s : %'d 파일 (%s)\n" "$name" "$file_count" "$total_size"
104
+ fi
105
+ else
106
+ printf " %-20s : 없음\n" "$name"
107
+ fi
108
+ }
109
+
110
+ check_raw_data "data/raw/cc100_ko" "cc100_ko/"
111
+ check_raw_data "data/raw/c4_ko" "c4_ko/"
112
+ check_raw_data "data/raw/namuwiki_ko" "namuwiki_ko/"
113
+
114
+ # 위키 데이터는 raw/ 직접 하위
115
+ echo ""
116
+ echo "[ 위키피디아 데이터 ]"
117
+ ko_wiki_count=$(find "data/raw" -maxdepth 1 -name "ko_wiki_*.txt" | wc -l)
118
+ en_wiki_count=$(find "data/raw" -maxdepth 1 -name "en_wiki_*.txt" | wc -l)
119
+ ko_wiki_size=$(du -sh "data/raw" 2>/dev/null | cut -f1)
120
+
121
+ if [ $ko_wiki_count -gt 0 ]; then
122
+ printf " %-20s : %'d 파일\n" "ko_wiki" "$ko_wiki_count"
123
+ fi
124
+
125
+ if [ $en_wiki_count -gt 0 ]; then
126
+ printf " %-20s : %'d 파일\n" "en_wiki" "$en_wiki_count"
127
+ fi
128
+
129
+ echo ""
130
+
131
+ # ============================================================================
132
+ # 4. 종합 상태 요약
133
+ # ============================================================================
134
+ echo "[ 종합 상태 ]"
135
+
136
+ # 학습용 바이너리 데이터 확인
137
+ binary_ready=false
138
+ if [ -f "data/korean_train.bin" ] && [ -f "data/korean_val.bin" ]; then
139
+ binary_ready=true
140
+ elif [ -f "data/train.bin" ] && [ -f "data/val.bin" ]; then
141
+ binary_ready=true
142
+ fi
143
+
144
+ # 토크나이저 확인
145
+ tokenizer_ready=false
146
+ if [ -d "tokenizer/korean_sp" ] && [ -f "tokenizer/korean_sp/tokenizer.model" ]; then
147
+ tokenizer_ready=true
148
+ fi
149
+
150
+ # 원본 데이터 확인
151
+ raw_ready=false
152
+ if [ -d "data/raw/c4_ko" ] || [ -d "data/raw/namuwiki_ko" ] || [ -d "data/raw/cc100_ko" ]; then
153
+ count=$(find "data/raw/c4_ko" -maxdepth 1 -type f 2>/dev/null | wc -l)
154
+ count=$((count + $(find "data/raw/namuwiki_ko" -maxdepth 1 -type f 2>/dev/null | wc -l)))
155
+ count=$((count + $(find "data/raw/cc100_ko" -maxdepth 1 -type f 2>/dev/null | wc -l)))
156
+ if [ $count -gt 0 ]; then
157
+ raw_ready=true
158
+ fi
159
+ fi
160
+
161
+ printf " 학습용 바이너리 : %s\n" "$([ "$binary_ready" = true ] && echo "✓ 준비됨" || echo "✗ 미준비")"
162
+ printf " 토크나이저 : %s\n" "$([ "$tokenizer_ready" = true ] && echo "✓ 준비됨" || echo "✗ 미준비")"
163
+ printf " 원본 데이터 : %s\n" "$([ "$raw_ready" = true ] && echo "✓ 준비됨" || echo "✗ 미준비")"
164
+
165
+ echo ""
166
+
167
+ # ============================================================================
168
+ # 5. 학습 설정 파라미터 정보
169
+ # ============================================================================
170
+ echo "[ 학습 설정 (1B 모델 기준) ]"
171
+ echo " 배치 사이즈 : 8"
172
+ echo " 시퀀스 길이 : 4096"
173
+ echo " GPU 수 : 8"
174
+ echo " 그래디언트 누적 : 4"
175
+ echo " 토큰/스텝 : 8 × 4 × 4096 × 8 = 1,048,576"
176
+ echo ""
177
+
178
+ echo "=== 검사 완료 ==="
source/scripts/clickhouse-watchdog.sh ADDED
@@ -0,0 +1,201 @@
1
+ #!/usr/bin/env bash
2
+ #
3
+ # clickhouse-watchdog.sh — ClickHouse 헬스체크 + 자동 재시작
4
+ # crontab에 등록하여 1분마다 실행
5
+ #
6
+ # Usage:
7
+ # */1 * * * * /PROJECT/0325120031_A/ghong/taketimes/llm-bang/scripts/clickhouse-watchdog.sh
8
+ #
9
+
10
+ set -euo pipefail
11
+
12
+ # ── 설정 ──────────────────────────────────────────────
13
+ CH_BIN="/PROJECT/0325120031_A/ghong/taketimes/clickhouse-bin"
14
+ CH_CONFIG="/PROJECT/0325120031_A/ghong/taketimes/llm-bang/configs/clickhouse-config.xml"
15
+ TCP_PORT=9000
16
+ HTTP_PORT=8123
17
+ HOST="127.0.0.1"
18
+
19
+ LOG_DIR="/tmp/clickhouse"
20
+ LOG_FILE="${LOG_DIR}/watchdog.log"
21
+ MAX_LOG_SIZE=$((10 * 1024 * 1024)) # 10MB 로테이션
22
+
23
+ RESTART_COOLDOWN=180 # 초 — 재시작 후 이 시간 내 재시도 방지
24
+ LAST_RESTART_FILE="/tmp/clickhouse-last-restart"
25
+ HEALTH_CHECK_TIMEOUT=5 # 초 — 헬스체크 curl/query 타임아웃
26
+
27
+ # ── 함수 ──────────────────────────────────────────────
28
+ mkdir -p "$LOG_DIR"
29
+
30
+ log() {
31
+ echo "[$(date '+%Y-%m-%d %H:%M:%S')] [clickhouse-watchdog] $*" >> "$LOG_FILE"
32
+ }
33
+
34
+ rotate_log() {
35
+ local file="$1"
36
+ if [[ -f "$file" ]] && [[ $(stat -c%s "$file" 2>/dev/null || echo 0) -gt $MAX_LOG_SIZE ]]; then
37
+ mv "$file" "${file}.old"
38
+ log "Log rotated: $file"
39
+ fi
40
+ }
41
+
42
+ is_tcp_port_open() {
43
+ if command -v ss &>/dev/null; then
44
+ ss -tlnH "sport = :${TCP_PORT}" 2>/dev/null | grep -q "$TCP_PORT"
45
+ else
46
+ (echo > /dev/tcp/"$HOST"/"$TCP_PORT") 2>/dev/null
47
+ fi
48
+ }
49
+
50
+ is_http_responding() {
51
+ # HTTP 인터페이스 핑 — ClickHouse는 GET / 에 "Ok.\n" 응답
52
+ if command -v curl &>/dev/null; then
53
+ local resp
54
+ resp=$(curl -s --max-time "$HEALTH_CHECK_TIMEOUT" "http://${HOST}:${HTTP_PORT}/ping" 2>/dev/null || true)
55
+ [[ "$resp" == "Ok." ]]
56
+ else
57
+ # curl 없으면 TCP 포트만 확인
58
+ (echo > /dev/tcp/"$HOST"/"$HTTP_PORT") 2>/dev/null
59
+ fi
60
+ }
61
+
62
+ is_process_alive() {
63
+ # ClickHouse 내부 watchdog 프로세스명: "clickhouse-watchdog" (바이너리 자체)
64
+ # 이 스크립트(clickhouse-watchdog.sh)와 구분하기 위해 --daemon 플래그 포함 패턴 사용
65
+ pgrep -f "clickhouse.*server.*--daemon" >/dev/null 2>&1
66
+ }
67
+
68
+ can_execute_query() {
69
+ # 실제 쿼리 실행으로 서버가 응답하는지 확인
70
+ local result
71
+ result=$("$CH_BIN" client --port "$TCP_PORT" --query "SELECT 1" 2>/dev/null || true)
72
+ [[ "$result" == "1" ]]
73
+ }
74
+
75
+ cooldown_active() {
76
+ if [[ -f "$LAST_RESTART_FILE" ]]; then
77
+ local last_restart now diff
78
+ last_restart=$(cat "$LAST_RESTART_FILE" 2>/dev/null)
79
+ now=$(date +%s)
80
+ diff=$(( now - last_restart ))
81
+ if [[ $diff -lt $RESTART_COOLDOWN ]]; then
82
+ return 0 # 쿨다운 중
83
+ fi
84
+ fi
85
+ return 1 # 쿨다운 아님
86
+ }
87
+
88
+ stop_existing() {
89
+ log "Stopping existing ClickHouse processes..."
90
+ local my_pid=$$
91
+ local pids
92
+
93
+ # 정상 종료 시도 (서버 프로세스)
94
+ pids=$(pgrep -f "clickhouse.*server.*--daemon" 2>/dev/null | grep -v "^${my_pid}$" || true)
95
+ if [[ -n "$pids" ]]; then
96
+ log "Sending TERM to PIDs: $pids"
97
+ echo "$pids" | xargs kill -TERM 2>/dev/null || true
98
+ sleep 3
99
+ # 아직 살아있으면 강제 종료
100
+ pids=$(pgrep -f "clickhouse.*server.*--daemon" 2>/dev/null | grep -v "^${my_pid}$" || true)
101
+ if [[ -n "$pids" ]]; then
102
+ log "Force killing PIDs: $pids"
103
+ echo "$pids" | xargs kill -9 2>/dev/null || true
104
+ sleep 2
105
+ fi
106
+ fi
107
+ }
108
+
109
+ start_server() {
110
+ log "Starting ClickHouse server (daemon mode)..."
111
+
112
+ # 기존 프로세스 정리
113
+ stop_existing
114
+
115
+ # 필요한 디렉토리 생성
116
+ mkdir -p /tmp/clickhouse/logs
117
+ mkdir -p /tmp/clickhouse-tmp
118
+
119
+ # 데몬 모드로 시작
120
+ "$CH_BIN" server --config-file="$CH_CONFIG" --daemon
121
+
122
+ # 시작 후 대기 + 확인 (최대 15초)
123
+ local attempts=0
124
+ local max_attempts=15
125
+ while [[ $attempts -lt $max_attempts ]]; do
126
+ sleep 1
127
+ attempts=$((attempts + 1))
128
+ if is_tcp_port_open && can_execute_query; then
129
+ date +%s > "$LAST_RESTART_FILE"
130
+ log "ClickHouse started successfully (took ${attempts}s)"
131
+ return 0
132
+ fi
133
+ done
134
+
135
+ date +%s > "$LAST_RESTART_FILE"
136
+ log "ERROR: ClickHouse did not respond within ${max_attempts}s after start"
137
+ return 1
138
+ }
139
+
140
+ # ── 메인 로직 ─────────────────────────────────────────
141
+ rotate_log "$LOG_FILE"
142
+
143
+ # 1) 바이너리 존재 확인
144
+ if [[ ! -x "$CH_BIN" ]]; then
145
+ log "FATAL: ClickHouse binary not found or not executable: $CH_BIN"
146
+ exit 1
147
+ fi
148
+
149
+ # 2) 프로세스 + 포트 + 쿼리 체크
150
+ process_ok=false
151
+ port_ok=false
152
+ query_ok=false
153
+
154
+ if is_process_alive; then
155
+ process_ok=true
156
+ fi
157
+
158
+ if is_tcp_port_open; then
159
+ port_ok=true
160
+ fi
161
+
162
+ if $port_ok && can_execute_query; then
163
+ query_ok=true
164
+ fi
165
+
166
+ # 3) 판단
167
+ if $process_ok && $port_ok && $query_ok; then
168
+ # 완전 정상 — 아무것도 안 함
169
+ exit 0
170
+ fi
171
+
172
+ # HTTP도 확인 (진단 로그용)
173
+ http_ok=false
174
+ if is_http_responding; then
175
+ http_ok=true
176
+ fi
177
+
178
+ # 비정상 상태 로깅
179
+ if $process_ok && $port_ok && ! $query_ok; then
180
+ log "WARN: Process alive, port open, but query failed. Possible hung state."
181
+ elif $process_ok && ! $port_ok; then
182
+ log "WARN: Process alive but TCP port $TCP_PORT not listening."
183
+ elif ! $process_ok; then
184
+ log "WARN: ClickHouse is completely down (no process found)."
185
+ fi
186
+ log "Status: process=$process_ok port=$port_ok query=$query_ok http=$http_ok"
187
+
188
+ # 4) 쿨다운 체크
189
+ if cooldown_active; then
190
+ log "Cooldown active (last restart < ${RESTART_COOLDOWN}s ago). Skipping."
191
+ exit 0
192
+ fi
193
+
194
+ # 5) 재시작
195
+ log "Attempting ClickHouse restart..."
196
+ if start_server; then
197
+ log "ClickHouse restart SUCCESS"
198
+ else
199
+ log "ClickHouse restart FAILED"
200
+ exit 1
201
+ fi
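
The Usage comment gives the crontab line; a hedged sketch of installing it non-interactively (the absolute path is taken from that comment):

```bash
# Register the watchdog to run every minute, replacing any previous entry for it.
SCRIPT=/PROJECT/0325120031_A/ghong/taketimes/llm-bang/scripts/clickhouse-watchdog.sh
( crontab -l 2>/dev/null | grep -vF "$SCRIPT"; echo "*/1 * * * * $SCRIPT" ) | crontab -
```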
source/scripts/convert_3b_gguf.sh ADDED
@@ -0,0 +1,229 @@
1
+ #!/usr/bin/env bash
2
+ # =============================================================================
3
+ # convert_3b_gguf.sh — 3B 모델 HuggingFace → GGUF 변환 + 다중 양자화
4
+ #
5
+ # Usage:
6
+ # bash scripts/convert_3b_gguf.sh [options]
7
+ #
8
+ # Options:
9
+ # --input_dir DIR HF 포맷 모델 디렉토리 (default: outputs/hf_korean_3b_orpo)
10
+ # --out_dir DIR GGUF 출력 디렉토리 (default: outputs/gguf)
11
+ # --checkpoint DIR 커스텀 체크포인트 디렉토리 (지정 시 HF 변환 선행 실행)
12
+ # --skip_hf_conv HF 변환 단계 건너뜀 (이미 HF 포맷 존재 시)
13
+ # --skip_quant 양자화 단계 건너뜀 (F16 GGUF만 생성)
14
+ #
15
+ # Pipeline:
16
+ # 1. [선택] 커스텀 체크포인트 → HF transformers 포맷 (convert_to_hf.py)
17
+ # 2. HF → F16 GGUF (llama.cpp/convert_hf_to_gguf.py)
18
+ # 3. F16 GGUF → Q4_K_M, Q5_K_M, Q8_0 양자화 (llama-quantize)
19
+ #
20
+ # Outputs:
21
+ # outputs/gguf/frankenstallm-3b-f16.gguf
22
+ # outputs/gguf/frankenstallm-3b-Q4_K_M.gguf — 권장 (Ollama용)
23
+ # outputs/gguf/frankenstallm-3b-Q5_K_M.gguf
24
+ # outputs/gguf/frankenstallm-3b-Q8_0.gguf
25
+ #
26
+ # 전제 조건:
27
+ # - python scripts/convert_to_hf.py 로 HF 변환 완료 (또는 --checkpoint 옵션)
28
+ # - git, cmake, make 설치
29
+ # - pip install safetensors
30
+ # =============================================================================
31
+ set -euo pipefail
32
+
33
+ # ---------------------------------------------------------------------------
34
+ # 인자 파싱
35
+ # ---------------------------------------------------------------------------
36
+ INPUT_DIR="outputs/hf_korean_3b_orpo"
37
+ OUT_DIR="outputs/gguf"
38
+ CHECKPOINT_DIR=""
39
+ SKIP_HF_CONV=false
40
+ SKIP_QUANT=false
41
+
42
+ while [[ $# -gt 0 ]]; do
43
+ case "$1" in
44
+ --input_dir) INPUT_DIR="$2"; shift 2 ;;
45
+ --out_dir) OUT_DIR="$2"; shift 2 ;;
46
+ --checkpoint) CHECKPOINT_DIR="$2"; shift 2 ;;
47
+ --skip_hf_conv) SKIP_HF_CONV=true; shift ;;
48
+ --skip_quant) SKIP_QUANT=true; shift ;;
49
+ -h|--help)
50
+ grep '^#' "$0" | head -40 | sed 's/^# \{0,1\}//'
51
+ exit 0 ;;
52
+ *)
53
+ echo "ERROR: 알 수 없는 옵션: $1"
54
+ echo "Usage: bash scripts/convert_3b_gguf.sh [--input_dir DIR] [--out_dir DIR] [--checkpoint DIR] [--skip_hf_conv] [--skip_quant]"
55
+ exit 1 ;;
56
+ esac
57
+ done
58
+
59
+ PROJECT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
60
+ LLAMA_CPP_DIR="${LLAMA_CPP_DIR:-$PROJECT_DIR/outputs/llama.cpp}"
61
+ MODEL_NAME="frankenstallm-3b"
62
+
63
+ cd "$PROJECT_DIR"
64
+
65
+ echo "=================================================================="
66
+ echo " 3B 모델 GGUF 변환 파이프라인"
67
+ echo " 입력 HF 디렉토리 : $INPUT_DIR"
68
+ echo " GGUF 출력 디렉토리: $OUT_DIR"
69
+ echo " llama.cpp 경로 : $LLAMA_CPP_DIR"
70
+ echo "=================================================================="
71
+ echo ""
72
+
73
+ # ---------------------------------------------------------------------------
74
+ # Step 0: llama.cpp 존재 여부 확인 / 클론
75
+ # ---------------------------------------------------------------------------
76
+ if [[ ! -d "$LLAMA_CPP_DIR" ]]; then
77
+ echo "[SETUP] llama.cpp 디렉토리가 없습니다."
78
+ echo " 다음 명령으로 설치하세요:"
79
+ echo ""
80
+ echo " git clone --depth 1 https://github.com/ggerganov/llama.cpp $LLAMA_CPP_DIR"
81
+ echo ""
82
+ echo " 또는 LLAMA_CPP_DIR 환경변수로 기존 경로를 지정하세요:"
83
+ echo " LLAMA_CPP_DIR=/path/to/llama.cpp bash scripts/convert_3b_gguf.sh"
84
+ echo ""
85
+ read -r -p "지금 자동 클론하시겠습니까? [y/N] " _yn
86
+ if [[ "${_yn:-N}" =~ ^[Yy]$ ]]; then
87
+ echo "Cloning llama.cpp ..."
88
+ git clone --depth 1 https://github.com/ggerganov/llama.cpp "$LLAMA_CPP_DIR"
89
+ else
90
+ echo "중단합니다. llama.cpp를 설치한 뒤 다시 실행하세요."
91
+ exit 1
92
+ fi
93
+ fi
94
+
95
+ # llama.cpp Python 의존성
96
+ echo "[SETUP] llama.cpp Python 의존성 설치 중 ..."
97
+ pip install -r "$LLAMA_CPP_DIR/requirements.txt" --break-system-packages -q
98
+
99
+ # ---------------------------------------------------------------------------
100
+ # Step 1: 커스텀 체크포인트 → HF 포맷 변환 (선택)
101
+ # ---------------------------------------------------------------------------
102
+ if [[ -n "$CHECKPOINT_DIR" && "$SKIP_HF_CONV" == "false" ]]; then
103
+ echo ""
104
+ echo "[STEP 1] 커스텀 체크포인트 → HF 포맷 변환"
105
+ echo " 체크포인트: $CHECKPOINT_DIR"
106
+ echo " 출력 : $INPUT_DIR"
107
+ echo ""
108
+
109
+ if [[ ! -d "$CHECKPOINT_DIR" ]]; then
110
+ echo "ERROR: 체크포인트 디렉토리를 찾을 수 없습니다: $CHECKPOINT_DIR"
111
+ exit 1
112
+ fi
113
+
114
+ python "$PROJECT_DIR/scripts/convert_to_hf.py" \
115
+ --checkpoint "$CHECKPOINT_DIR" \
116
+ --output "$INPUT_DIR" \
117
+ --tokenizer "tokenizer/korean_sp/tokenizer.json"
118
+
119
+ echo " [OK] HF 변환 완료 → $INPUT_DIR"
120
+ elif [[ "$SKIP_HF_CONV" == "true" ]]; then
121
+ echo "[STEP 1] HF 변환 건너뜀 (--skip_hf_conv)"
122
+ else
123
+ echo "[STEP 1] 체크포인트 미지정 — HF 디렉토리를 직접 사용합니다."
124
+ fi
125
+
126
+ # HF 디렉토리 최종 검증
127
+ if [[ ! -d "$INPUT_DIR" ]]; then
128
+ echo "ERROR: HF 모델 디렉토리를 찾을 수 없습니다: $INPUT_DIR"
129
+ echo " --checkpoint 옵션으로 체크포인트를 지정하거나,"
130
+ echo " python scripts/convert_to_hf.py 를 먼저 실행하세요."
131
+ exit 1
132
+ fi
133
+
134
+ if [[ ! -f "$INPUT_DIR/config.json" ]]; then
135
+ echo "ERROR: config.json 이 없습니다: $INPUT_DIR/config.json"
136
+ exit 1
137
+ fi
138
+
139
+ mkdir -p "$OUT_DIR"
140
+
141
+ # ---------------------------------------------------------------------------
142
+ # Step 2: llama.cpp 빌드 (llama-quantize 바이너리)
143
+ # ---------------------------------------------------------------------------
144
+ QUANTIZE_BIN="$LLAMA_CPP_DIR/build/bin/llama-quantize"
145
+
146
+ if [[ ! -f "$QUANTIZE_BIN" ]]; then
147
+ echo ""
148
+ echo "[STEP 2] llama.cpp 빌드 중 (llama-quantize) ..."
149
+ cmake -S "$LLAMA_CPP_DIR" -B "$LLAMA_CPP_DIR/build" \
150
+ -DCMAKE_BUILD_TYPE=Release \
151
+ -DGGML_CUDA=ON \
152
+ 2>&1 | tail -10
153
+ cmake --build "$LLAMA_CPP_DIR/build" --target llama-quantize -j "$(nproc)" \
154
+ 2>&1 | tail -10
155
+ echo " [OK] 빌드 완료: $QUANTIZE_BIN"
156
+ else
157
+ echo "[STEP 2] llama-quantize 바이너리 이미 존재 — 빌드 건너뜀"
158
+ fi
159
+
160
+ # ---------------------------------------------------------------------------
161
+ # Step 3: HF → F16 GGUF 변환
162
+ # ---------------------------------------------------------------------------
163
+ F16_GGUF="$OUT_DIR/${MODEL_NAME}-f16.gguf"
164
+
165
+ echo ""
166
+ echo "[STEP 3] HF → F16 GGUF 변환"
167
+ echo " 입력: $INPUT_DIR"
168
+ echo " 출력: $F16_GGUF"
169
+ echo ""
170
+
171
+ python "$LLAMA_CPP_DIR/convert_hf_to_gguf.py" "$INPUT_DIR" \
172
+ --outfile "$F16_GGUF" \
173
+ --outtype f16
174
+
175
+ echo " [OK] F16 GGUF 크기: $(du -sh "$F16_GGUF" | cut -f1) ($F16_GGUF)"
176
+
177
+ # ---------------------------------------------------------------------------
178
+ # Step 4: 다중 양자화 (Q4_K_M, Q5_K_M, Q8_0)
179
+ # ---------------------------------------------------------------------------
180
+ if [[ "$SKIP_QUANT" == "true" ]]; then
181
+ echo ""
182
+ echo "[STEP 4] 양자화 건너뜀 (--skip_quant)"
183
+ else
184
+ echo ""
185
+ echo "[STEP 4] 다중 양자화 시작 ..."
186
+
187
+ if [[ ! -f "$QUANTIZE_BIN" ]]; then
188
+ echo "[WARN] llama-quantize 바이너리를 찾을 수 없습니다: $QUANTIZE_BIN"
189
+ echo " 양자화를 건너뜁니다. F16 GGUF만 생성되었습니다."
190
+ echo " 수동 빌드: cmake --build $LLAMA_CPP_DIR/build --target llama-quantize"
191
+ else
192
+ # Q4_K_M — 가장 작은 크기, 품질/속도 균형 (Ollama 기본 권장)
193
+ Q4KM_GGUF="$OUT_DIR/${MODEL_NAME}-Q4_K_M.gguf"
194
+ echo " → Q4_K_M 양자화: $Q4KM_GGUF ..."
195
+ "$QUANTIZE_BIN" "$F16_GGUF" "$Q4KM_GGUF" Q4_K_M
196
+ echo " 크기: $(du -sh "$Q4KM_GGUF" | cut -f1)"
197
+
198
+ # Q5_K_M — 중간 크기, 더 높은 품질
199
+ Q5KM_GGUF="$OUT_DIR/${MODEL_NAME}-Q5_K_M.gguf"
200
+ echo " → Q5_K_M 양자화: $Q5KM_GGUF ..."
201
+ "$QUANTIZE_BIN" "$F16_GGUF" "$Q5KM_GGUF" Q5_K_M
202
+ echo " 크기: $(du -sh "$Q5KM_GGUF" | cut -f1)"
203
+
204
+ # Q8_0 — 가장 높은 품질 (F16 근사)
205
+ Q8_GGUF="$OUT_DIR/${MODEL_NAME}-Q8_0.gguf"
206
+ echo " → Q8_0 양자화: $Q8_GGUF ..."
207
+ "$QUANTIZE_BIN" "$F16_GGUF" "$Q8_GGUF" Q8_0
208
+ echo " 크기: $(du -sh "$Q8_GGUF" | cut -f1)"
209
+
210
+ echo ""
211
+ echo " [OK] 모든 양자화 완료"
212
+ fi
213
+ fi
214
+
215
+ # ---------------------------------------------------------------------------
216
+ # 완료 요약
217
+ # ---------------------------------------------------------------------------
218
+ echo ""
219
+ echo "=================================================================="
220
+ echo " 3B GGUF 변환 완료"
221
+ echo ""
222
+ echo " 출력 파일 목록:"
223
+ ls -lh "$OUT_DIR/${MODEL_NAME}"*.gguf 2>/dev/null | awk '{print " " $5 " " $9}' || \
224
+ echo " (파일 목록 확인: ls -lh $OUT_DIR/)"
225
+ echo ""
226
+ echo " 다음 단계:"
227
+ echo " bash scripts/deploy_3b_ollama.sh"
228
+ echo " bash scripts/quality_gate.sh deploy"
229
+ echo "=================================================================="
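
A hedged end-to-end invocation of the pipeline, assuming the ORPO checkpoint referenced by `deploy_ollama.sh` is still in the custom (non-HF) format so that the script runs `convert_to_hf.py` first, and that llama.cpp is already checked out at the given path:

```bash
# Raw checkpoint → HF → F16 GGUF → Q4_K_M / Q5_K_M / Q8_0 in one go.
LLAMA_CPP_DIR=/path/to/llama.cpp \
bash scripts/convert_3b_gguf.sh \
  --checkpoint checkpoints/korean_3b_orpo_v1/checkpoint-9840 \
  --out_dir outputs/gguf
```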
source/scripts/convert_to_gguf.sh ADDED
@@ -0,0 +1,92 @@
1
+ #!/usr/bin/env bash
2
+ # =============================================================================
3
+ # convert_to_gguf.sh — HuggingFace 포맷 모델을 GGUF로 변환 + Q4_K_M 양자화
4
+ #
5
+ # Usage:
6
+ # bash scripts/convert_to_gguf.sh [hf_dir] [out_dir]
7
+ #
8
+ # hf_dir : HF 포맷 모델 디렉토리 (default: outputs/hf)
9
+ # out_dir : GGUF 출력 디렉토리 (default: outputs/gguf)
10
+ #
11
+ # Outputs:
12
+ # outputs/gguf/korean-1b-f16.gguf — F16 GGUF
13
+ # outputs/gguf/korean-1b-q4km.gguf — Q4_K_M 양자화 (Ollama용)
14
+ #
15
+ # 전제 조건:
16
+ # - python scripts/convert_to_hf.py 로 HF 변환 완료
17
+ # - git, cmake, make 설치
18
+ # - pip install safetensors (없으면 pytorch_model.bin으로 fallback)
19
+ # =============================================================================
20
+ set -euo pipefail
21
+
22
+ HF_DIR="${1:-outputs/hf}"
23
+ OUT_DIR="${2:-outputs/gguf}"
24
+ LLAMA_CPP_DIR="outputs/llama.cpp"
25
+ PROJECT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
26
+
27
+ cd "$PROJECT_DIR"
28
+
29
+ # --- Pre-flight check -------------------------------------------------------
30
+ if [[ ! -d "$HF_DIR" ]]; then
31
+ echo "ERROR: HF model directory not found: $HF_DIR"
32
+ echo "Run first: python scripts/convert_to_hf.py --checkpoint <ckpt> --output $HF_DIR"
33
+ exit 1
34
+ fi
35
+
36
+ if [[ ! -f "$HF_DIR/config.json" ]]; then
37
+ echo "ERROR: config.json not found in $HF_DIR"
38
+ exit 1
39
+ fi
40
+
41
+ mkdir -p "$OUT_DIR"
42
+
43
+ # --- Clone llama.cpp if not present -----------------------------------------
44
+ if [[ ! -d "$LLAMA_CPP_DIR" ]]; then
45
+ echo "Cloning llama.cpp ..."
46
+ git clone --depth 1 https://github.com/ggerganov/llama.cpp "$LLAMA_CPP_DIR"
47
+ fi
48
+
49
+ # Install Python requirements for conversion script
50
+ echo "Installing llama.cpp Python deps ..."
51
+ pip install -r "$LLAMA_CPP_DIR/requirements.txt" --break-system-packages -q
52
+
53
+ # --- Build llama.cpp (for quantization binary) ------------------------------
54
+ QUANTIZE_BIN="$LLAMA_CPP_DIR/build/bin/llama-quantize"
55
+ if [[ ! -f "$QUANTIZE_BIN" ]]; then
56
+ echo "Building llama.cpp (quantization tool) ..."
57
+ cmake -S "$LLAMA_CPP_DIR" -B "$LLAMA_CPP_DIR/build" \
58
+ -DCMAKE_BUILD_TYPE=Release \
59
+ -DGGML_CUDA=ON \
60
+ 2>&1 | tail -5
61
+ cmake --build "$LLAMA_CPP_DIR/build" --target llama-quantize -j "$(nproc)" \
62
+ 2>&1 | tail -5
63
+ fi
64
+
65
+ # --- F16 GGUF conversion ---------------------------------------------------
66
+ F16_GGUF="$OUT_DIR/korean-1b-f16.gguf"
67
+ echo "Converting to F16 GGUF: $F16_GGUF ..."
68
+ python "$LLAMA_CPP_DIR/convert_hf_to_gguf.py" "$HF_DIR" \
69
+ --outfile "$F16_GGUF" \
70
+ --outtype f16
71
+
72
+ echo "F16 GGUF size: $(du -sh "$F16_GGUF" | cut -f1)"
73
+
74
+ # --- Q4_K_M quantization ---------------------------------------------------
75
+ Q4KM_GGUF="$OUT_DIR/korean-1b-q4km.gguf"
76
+ if [[ -f "$QUANTIZE_BIN" ]]; then
77
+ echo "Quantizing to Q4_K_M: $Q4KM_GGUF ..."
78
+ "$QUANTIZE_BIN" "$F16_GGUF" "$Q4KM_GGUF" Q4_K_M
79
+ echo "Q4_K_M GGUF size: $(du -sh "$Q4KM_GGUF" | cut -f1)"
80
+ else
81
+ echo "[WARN] llama-quantize binary not found. Using F16 GGUF for Ollama."
82
+ echo " Build: cmake --build $LLAMA_CPP_DIR/build --target llama-quantize"
83
+ cp "$F16_GGUF" "$Q4KM_GGUF"
84
+ fi
85
+
86
+ echo ""
87
+ echo "=================================================================="
88
+ echo " GGUF 변환 완료"
89
+ echo " F16 : $F16_GGUF"
90
+ echo " Q4KM: $Q4KM_GGUF"
91
+ echo " 다음 단계: bash scripts/deploy_ollama.sh"
92
+ echo "=================================================================="
source/scripts/convert_to_hf.py ADDED
@@ -0,0 +1,262 @@
1
+ """
2
+ Convert custom LLM checkpoint to HuggingFace LlamaForCausalLM format.
3
+
4
+ Usage:
5
+ python scripts/convert_to_hf.py \\
6
+ --checkpoint checkpoints/korean_1b_fp8_run1/checkpoint-0034000 \\
7
+ --output outputs/hf \\
8
+ [--tokenizer tokenizer/korean_sp/tokenizer.json]
9
+
10
+ Outputs (in --output directory):
11
+ config.json — LlamaConfig
12
+ model.safetensors — converted weights
13
+ tokenizer.json — tokenizer (copied)
14
+ tokenizer_config.json
15
+ generation_config.json
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ import argparse
21
+ import json
22
+ import shutil
23
+ import sys
24
+ from pathlib import Path
25
+
26
+ import torch
27
+
28
+ _PROJECT_ROOT = Path(__file__).resolve().parent.parent
29
+ if str(_PROJECT_ROOT) not in sys.path:
30
+ sys.path.insert(0, str(_PROJECT_ROOT))
31
+
32
+ from model.config import LMConfig
33
+
34
+
35
+ def remap_weights(
36
+ src_state_dict: dict,
37
+ config: LMConfig,
38
+ ) -> dict:
39
+ """
40
+ Remap custom LLM weight names to HuggingFace LlamaForCausalLM names.
41
+
42
+ Handles both FP8 (te.LayerNormMLP / te.Linear) and BF16 (SwiGLU / nn.Linear)
43
+ checkpoints transparently.
44
+ """
45
+ dst = {}
46
+ is_fp8 = config.use_fp8
47
+
48
+ # --- Token embedding ---
49
+ dst["model.embed_tokens.weight"] = src_state_dict["embedding.weight"].float()
50
+
51
+ for i in range(config.n_layers):
52
+ pfx = f"layers.{i}"
53
+ hpfx = f"model.layers.{i}"
54
+
55
+ # Attention norm (always RMSNorm)
56
+ dst[f"{hpfx}.input_layernorm.weight"] = (
57
+ src_state_dict[f"{pfx}.attn_norm.weight"].float()
58
+ )
59
+
60
+ # Attention projections
61
+ # Handle fused QKV (te.Linear with qkv_proj) vs separate q/k/v
62
+ qkv_key = f"{pfx}.attn.qkv_proj.weight"
63
+ if qkv_key in src_state_dict:
64
+ # Fused QKV: [Q_dim + K_dim + V_dim, d_model]
65
+ # GQA: Q = n_heads * head_dim, K = V = n_kv_heads * head_dim
66
+ qkv = src_state_dict[qkv_key].float()
67
+ head_dim = config.d_model // config.n_heads
68
+ q_dim = config.n_heads * head_dim # e.g. 24 * 128 = 3072
69
+ k_dim = config.n_kv_heads * head_dim # e.g. 8 * 128 = 1024
70
+ v_dim = config.n_kv_heads * head_dim # e.g. 8 * 128 = 1024
71
+ assert qkv.shape[0] == q_dim + k_dim + v_dim, (
72
+ f"QKV shape mismatch: {qkv.shape[0]} != {q_dim}+{k_dim}+{v_dim}"
73
+ )
74
+ dst[f"{hpfx}.self_attn.q_proj.weight"] = qkv[:q_dim]
75
+ dst[f"{hpfx}.self_attn.k_proj.weight"] = qkv[q_dim:q_dim + k_dim]
76
+ dst[f"{hpfx}.self_attn.v_proj.weight"] = qkv[q_dim + k_dim:]
77
+ else:
78
+ # Separate q/k/v projections
79
+ for src_name, dst_name in [
80
+ ("q_proj", "self_attn.q_proj"),
81
+ ("k_proj", "self_attn.k_proj"),
82
+ ("v_proj", "self_attn.v_proj"),
83
+ ]:
84
+ w_key = f"{pfx}.attn.{src_name}.weight"
85
+ if w_key in src_state_dict:
86
+ dst[f"{hpfx}.{dst_name}.weight"] = src_state_dict[w_key].float()
87
+
88
+ # Output projection
89
+ out_key = f"{pfx}.attn.out_proj.weight"
90
+ if out_key in src_state_dict:
91
+ dst[f"{hpfx}.self_attn.o_proj.weight"] = src_state_dict[out_key].float()
92
+
93
+ # FFN — FP8 (te.LayerNormMLP) vs BF16 (SwiGLU)
94
+ if is_fp8 and f"{pfx}.ffn.layer_norm_weight" in src_state_dict:
95
+ # te.LayerNormMLP: RMSNorm is fused inside
96
+ dst[f"{hpfx}.post_attention_layernorm.weight"] = (
97
+ src_state_dict[f"{pfx}.ffn.layer_norm_weight"].float()
98
+ )
99
+ # fc1_weight: [2*d_ffn, d_model] — gate and up are concatenated
100
+ fc1 = src_state_dict[f"{pfx}.ffn.fc1_weight"].float()
101
+ half = fc1.shape[0] // 2
102
+ dst[f"{hpfx}.mlp.gate_proj.weight"] = fc1[:half]
103
+ dst[f"{hpfx}.mlp.up_proj.weight"] = fc1[half:]
104
+ # fc2_weight: [d_model, d_ffn]
105
+ dst[f"{hpfx}.mlp.down_proj.weight"] = (
106
+ src_state_dict[f"{pfx}.ffn.fc2_weight"].float()
107
+ )
108
+ else:
109
+ # Standard SwiGLU (BF16 checkpoint)
110
+ dst[f"{hpfx}.post_attention_layernorm.weight"] = (
111
+ src_state_dict[f"{pfx}.ffn_norm.weight"].float()
112
+ )
113
+ dst[f"{hpfx}.mlp.gate_proj.weight"] = (
114
+ src_state_dict[f"{pfx}.ffn.gate_proj.weight"].float()
115
+ )
116
+ dst[f"{hpfx}.mlp.up_proj.weight"] = (
117
+ src_state_dict[f"{pfx}.ffn.up_proj.weight"].float()
118
+ )
119
+ dst[f"{hpfx}.mlp.down_proj.weight"] = (
120
+ src_state_dict[f"{pfx}.ffn.down_proj.weight"].float()
121
+ )
122
+
123
+ # --- Final norm and LM head ---
124
+ dst["model.norm.weight"] = src_state_dict["norm.weight"].float()
125
+ # Weight tying: embedding.weight == lm_head.weight in our model.
126
+ # HF LlamaForCausalLM expects lm_head.weight explicitly.
127
+ dst["lm_head.weight"] = src_state_dict["embedding.weight"].float().clone()
128
+
129
+ return dst
130
+
131
+
132
+ def build_llama_config(config: LMConfig) -> dict:
133
+ """Map LMConfig fields to HuggingFace LlamaConfig dict."""
134
+ return {
135
+ "architectures": ["LlamaForCausalLM"],
136
+ "model_type": "llama",
137
+ "hidden_size": config.d_model,
138
+ "intermediate_size": config.d_ffn,
139
+ "num_hidden_layers": config.n_layers,
140
+ "num_attention_heads": config.n_heads,
141
+ "num_key_value_heads": config.n_kv_heads,
142
+ "hidden_act": "silu",
143
+ "max_position_embeddings": config.max_seq_len,
144
+ "initializer_range": 0.02,
145
+ "rms_norm_eps": 1e-5,
146
+ "vocab_size": config.vocab_size,
147
+ "rope_theta": config.rope_theta,
148
+ "rope_scaling": None,
149
+ "attention_bias": config.bias,
150
+ "tie_word_embeddings": True,
151
+ "torch_dtype": "float16",
152
+ "transformers_version": "4.40.0",
153
+ }
154
+
155
+
156
+ def main() -> None:
157
+ parser = argparse.ArgumentParser(
158
+ description="Convert custom LLM checkpoint to HuggingFace LlamaForCausalLM format."
159
+ )
160
+ parser.add_argument(
161
+ "--checkpoint",
162
+ required=True,
163
+ type=Path,
164
+ help="Path to checkpoint directory (must contain model.pt + config.yaml).",
165
+ )
166
+ parser.add_argument(
167
+ "--output",
168
+ required=True,
169
+ type=Path,
170
+ help="Output directory for HF-format files.",
171
+ )
172
+ parser.add_argument(
173
+ "--tokenizer",
174
+ type=Path,
175
+ default=Path("tokenizer/korean_sp/tokenizer.json"),
176
+ help="Path to tokenizer.json (default: tokenizer/korean_sp/tokenizer.json).",
177
+ )
178
+ args = parser.parse_args()
179
+
180
+ ckpt_path = args.checkpoint
181
+ out_path = args.output
182
+
183
+ if not ckpt_path.exists():
184
+ raise FileNotFoundError(f"Checkpoint not found: {ckpt_path}")
185
+
186
+ out_path.mkdir(parents=True, exist_ok=True)
187
+ print(f"Checkpoint : {ckpt_path}")
188
+ print(f"Output : {out_path}")
189
+
190
+ # Load config
191
+ config = LMConfig.from_yaml(ckpt_path / "config.yaml")
192
+ print(f"Model : d_model={config.d_model}, n_layers={config.n_layers}, "
193
+ f"vocab_size={config.vocab_size}, use_fp8={config.use_fp8}")
194
+
195
+ # Load weights
196
+ print("Loading model.pt ...")
197
+ state_dict = torch.load(
198
+ ckpt_path / "model.pt",
199
+ map_location="cpu",
200
+ weights_only=True,
201
+ )
202
+ print(f" Source keys: {len(state_dict)}")
203
+
204
+ # Remap
205
+ print("Remapping weight names ...")
206
+ hf_state_dict = remap_weights(state_dict, config)
207
+ print(f" Destination keys: {len(hf_state_dict)}")
208
+
209
+ # Save safetensors
210
+ print("Saving model.safetensors ...")
211
+ try:
212
+ from safetensors.torch import save_file
213
+ save_file(hf_state_dict, out_path / "model.safetensors")
214
+ except ImportError:
215
+ print(" [WARN] safetensors not installed; falling back to pytorch_model.bin")
216
+ torch.save(hf_state_dict, out_path / "pytorch_model.bin")
217
+
218
+ # Save config.json
219
+ llama_cfg = build_llama_config(config)
220
+ with open(out_path / "config.json", "w", encoding="utf-8") as f:
221
+ json.dump(llama_cfg, f, indent=2, ensure_ascii=False)
222
+ print("Saved config.json")
223
+
224
+ # Save generation_config.json
225
+ gen_cfg = {
226
+ "bos_token_id": 1,
227
+ "eos_token_id": 2,
228
+ "pad_token_id": 0,
229
+ "max_new_tokens": 512,
230
+ "temperature": 0.8,
231
+ "top_p": 0.9,
232
+ "do_sample": True,
233
+ }
234
+ with open(out_path / "generation_config.json", "w", encoding="utf-8") as f:
235
+ json.dump(gen_cfg, f, indent=2, ensure_ascii=False)
236
+
237
+ # Copy tokenizer
238
+ tok_src = args.tokenizer
239
+ if tok_src.exists():
240
+ shutil.copy(tok_src, out_path / "tokenizer.json")
241
+ # Minimal tokenizer_config.json for HF compatibility
242
+ tok_cfg = {
243
+ "model_type": "llama",
244
+ "tokenizer_class": "PreTrainedTokenizerFast",
245
+ "bos_token": "<s>",
246
+ "eos_token": "</s>",
247
+ "unk_token": "<unk>",
248
+ "pad_token": "<pad>",
249
+ "clean_up_tokenization_spaces": False,
250
+ }
251
+ with open(out_path / "tokenizer_config.json", "w", encoding="utf-8") as f:
252
+ json.dump(tok_cfg, f, indent=2, ensure_ascii=False)
253
+ print(f"Copied tokenizer: {tok_src} -> {out_path / 'tokenizer.json'}")
254
+ else:
255
+ print(f"[WARN] Tokenizer not found at {tok_src}. Copy manually.")
256
+
257
+ print(f"\nDone! HF model saved to: {out_path}")
258
+ print("Verify: ls -lh", out_path)
259
+
260
+
261
+ if __name__ == "__main__":
262
+ main()
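
A minimal post-conversion sanity check, assuming the `transformers` package is installed and the output directory is `outputs/hf`; the classes used below are stock Hugging Face APIs, not part of this repo:

```bash
python3 - <<'EOF'
# Load the converted model with vanilla HF classes and run a short greedy generation.
from transformers import AutoModelForCausalLM, PreTrainedTokenizerFast

model = AutoModelForCausalLM.from_pretrained("outputs/hf", torch_dtype="auto")
tok = PreTrainedTokenizerFast(tokenizer_file="outputs/hf/tokenizer.json")
ids = tok("대한민국의 수도는", return_tensors="pt").input_ids
out = model.generate(ids, max_new_tokens=32, do_sample=False)
print(tok.decode(out[0]))
EOF
```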
source/scripts/deploy_3b_ollama.sh ADDED
@@ -0,0 +1,146 @@
1
+ #!/usr/bin/env bash
2
+ # =============================================================================
3
+ # deploy_3b_ollama.sh — 3B GGUF 모델을 Ollama에 등록 & 자동 테스트
4
+ #
5
+ # Usage:
6
+ # bash scripts/deploy_3b_ollama.sh [model_name]
7
+ #
8
+ # model_name: Ollama 모델 이름 (default: frankenstallm-3b)
9
+ #
10
+ # 전제 조건:
11
+ # - ollama 설치: https://ollama.com/download
12
+ # - bash scripts/convert_3b_gguf.sh 실행 완료
13
+ # - outputs/gguf/frankenstallm-3b-Q4_K_M.gguf 존재
14
+ # - Modelfile.3b 존재
15
+ # =============================================================================
16
+ set -euo pipefail
17
+
18
+ MODEL_NAME="${1:-frankenstallm-3b}"
19
+ PROJECT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
20
+ MODELFILE="$PROJECT_DIR/Modelfile.3b"
21
+ GGUF_PATH="$PROJECT_DIR/outputs/gguf/frankenstallm-3b-Q4_K_M.gguf"
22
+
23
+ cd "$PROJECT_DIR"
24
+
25
+ # ---------------------------------------------------------------------------
26
+ # Pre-flight check
27
+ # ---------------------------------------------------------------------------
28
+ if ! command -v ollama &> /dev/null; then
29
+ echo "ERROR: ollama가 설치되어 있지 않습니다."
30
+ echo "설치: curl -fsSL https://ollama.com/install.sh | sh"
31
+ exit 1
32
+ fi
33
+
34
+ if [[ ! -f "$GGUF_PATH" ]]; then
35
+ echo "ERROR: GGUF 파일을 찾을 수 없습니다: $GGUF_PATH"
36
+ echo "먼저 실행: bash scripts/convert_3b_gguf.sh"
37
+ exit 1
38
+ fi
39
+
40
+ if [[ ! -f "$MODELFILE" ]]; then
41
+ echo "ERROR: Modelfile.3b 를 찾을 수 없습니다: $MODELFILE"
42
+ echo " 프로젝트 루트에 Modelfile.3b 가 있어야 합니다."
43
+ exit 1
44
+ fi
45
+
46
+ echo "=================================================================="
47
+ echo " 3B 모델 Ollama 배포"
48
+ echo " 모델명 : $MODEL_NAME"
49
+ echo " GGUF : $(du -sh "$GGUF_PATH" | cut -f1) ($GGUF_PATH)"
50
+ echo " Modelfile: $MODELFILE"
51
+ echo "=================================================================="
52
+ echo ""
53
+
54
+ # ---------------------------------------------------------------------------
55
+ # Ollama 서버 실행 확인
56
+ # ---------------------------------------------------------------------------
57
+ if ! ollama list &>/dev/null; then
58
+ echo "[WARN] Ollama 서버가 응답하지 않습니다. 백그라운드로 시작합니다 ..."
59
+ ollama serve &>/tmp/ollama_serve.log &
60
+ OLLAMA_PID=$!
61
+ echo " PID: $OLLAMA_PID (로그: /tmp/ollama_serve.log)"
62
+ # 서버 준비 대기 (최대 15초)
63
+ for i in $(seq 1 15); do
64
+ if ollama list &>/dev/null; then
65
+ echo " [OK] Ollama 서버 준비 완료 (${i}초)"
66
+ break
67
+ fi
68
+ sleep 1
69
+ done
70
+ fi
71
+
72
+ # ---------------------------------------------------------------------------
73
+ # Ollama 모델 등록
74
+ # ---------------------------------------------------------------------------
75
+ echo "[1/2] Ollama 모델 등록 중: $MODEL_NAME ..."
76
+ ollama create "$MODEL_NAME" -f "$MODELFILE"
77
+ echo " [OK] 등록 완료"
78
+
79
+ # ---------------------------------------------------------------------------
80
+ # 자동 테스트 프롬프트 5개 실행
81
+ # ---------------------------------------------------------------------------
82
+ echo ""
83
+ echo "[2/2] 자동 테스트 프롬프트 실행 (5개) ..."
84
+ echo ""
85
+
86
+ declare -a TEST_PROMPTS=(
87
+ "안녕하세요! 간단히 자기소개를 해주세요."
88
+ "대한민국의 수도는 어디인가요? 그 도시의 특징을 설명해주세요."
89
+ "파이썬으로 피보나치 수열을 출력하는 함수를 작성해주세요."
90
+ "인공지능이 사회에 미치는 긍정적인 영향 3가지를 설명해주세요."
91
+ "오늘 저녁 메뉴로 무엇을 추천해주시겠어요? 이유도 함께 말씀해주세요."
92
+ )
93
+
94
+ PASS_COUNT=0
95
+ FAIL_COUNT=0
96
+ TOTAL=${#TEST_PROMPTS[@]}
97
+
98
+ for i in "${!TEST_PROMPTS[@]}"; do
99
+ PROMPT="${TEST_PROMPTS[$i]}"
100
+ NUM=$((i + 1))
101
+ echo "--- 테스트 $NUM/$TOTAL ---"
102
+ echo "프롬프트: $PROMPT"
103
+ echo ""
104
+
105
+ # ollama run: 타임아웃 60초, 응답 첫 300자만 표시
106
+ if RESPONSE=$(timeout 60 ollama run "$MODEL_NAME" "$PROMPT" 2>&1); then
107
+ RESP_PREVIEW="${RESPONSE:0:300}"
108
+ echo "응답: $RESP_PREVIEW"
109
+ if [[ ${#RESPONSE} -gt 300 ]]; then
110
+ echo " ... (총 ${#RESPONSE}자)"
111
+ fi
112
+ echo "[OK] 테스트 $NUM 성공"
113
+ PASS_COUNT=$((PASS_COUNT + 1))
114
+ else
115
+ EXIT_CODE=$?
116
+ echo "[FAIL] 테스트 $NUM 실패 (exit code: $EXIT_CODE)"
117
+ FAIL_COUNT=$((FAIL_COUNT + 1))
118
+ fi
119
+ echo ""
120
+ done
121
+
122
+ # ---------------------------------------------------------------------------
123
+ # 결과 요약
124
+ # ---------------------------------------------------------------------------
125
+ echo "=================================================================="
126
+ echo " 배포 & 테스트 완료"
127
+ echo ""
128
+ echo " 모델명 : $MODEL_NAME"
129
+ echo " 테스트 : $PASS_COUNT/$TOTAL 성공 ($FAIL_COUNT 실패)"
130
+ echo ""
131
+ if [[ $FAIL_COUNT -eq 0 ]]; then
132
+ echo " [PASS] 모든 테스트 통과"
133
+ else
134
+ echo " [WARN] 일부 테스트 실패 — 로그를 확인하세요"
135
+ fi
136
+ echo ""
137
+ echo " Ollama 사용법:"
138
+ echo " ollama run $MODEL_NAME"
139
+ echo " ollama run $MODEL_NAME '질문을 여기에 입력하세요'"
140
+ echo " ollama rm $MODEL_NAME (삭제)"
141
+ echo ""
142
+ echo " Quality Gate:"
143
+ echo " bash scripts/quality_gate.sh deploy"
144
+ echo "=================================================================="
145
+
146
+ [[ $FAIL_COUNT -gt 0 ]] && exit 1 || exit 0
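
Besides `ollama run`, the registered model can also be exercised through Ollama's local HTTP API (port 11434 is Ollama's default and is not configured by this script):

```bash
# Single non-streaming completion against the locally registered model.
curl -s http://localhost:11434/api/generate \
  -d '{"model": "frankenstallm-3b", "prompt": "대한민국의 수도는?", "stream": false}'
```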
source/scripts/deploy_ollama.sh ADDED
@@ -0,0 +1,118 @@
1
+ #!/usr/bin/env bash
2
+ # =============================================================================
3
+ # deploy_ollama.sh — FRANKENSTALLM 3B GGUF → Ollama 원클릭 배포
4
+ #
5
+ # Usage:
6
+ # bash scripts/deploy_ollama.sh # 기본 (Q4_K_M)
7
+ # bash scripts/deploy_ollama.sh --quant Q8_0 # Q8_0 양자화
8
+ # bash scripts/deploy_ollama.sh --skip_convert # GGUF 이미 존재 시
9
+ #
10
+ # Pipeline:
11
+ # 1. [선택] GGUF 변환 + 양자화 (convert_3b_gguf.sh)
12
+ # 2. Ollama 설치 확인 / 서버 시작
13
+ # 3. Modelfile.3b로 모델 등록
14
+ # 4. 자동 테스트 (5개 프롬프트)
15
+ # 5. 반복률 검증 (15개 프롬프트)
16
+ # =============================================================================
17
+ set -euo pipefail
18
+
19
+ QUANT="${QUANT:-Q4_K_M}"
20
+ MODEL_NAME="frankenstallm-3b"
21
+ SKIP_CONVERT=false
22
+
23
+ while [[ $# -gt 0 ]]; do
24
+ case "$1" in
25
+ --quant) QUANT="$2"; shift 2 ;;
26
+ --skip_convert) SKIP_CONVERT=true; shift ;;
27
+ -h|--help)
28
+ grep '^#' "$0" | head -20 | sed 's/^# \{0,1\}//'
29
+ exit 0 ;;
30
+ *) echo "ERROR: 알 수 없는 옵션: $1"; exit 1 ;;
31
+ esac
32
+ done
33
+
34
+ PROJECT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
35
+ cd "$PROJECT_DIR"
36
+
37
+ GGUF_PATH="outputs/gguf/frankenstallm-3b-${QUANT}.gguf"
38
+ MODELFILE="Modelfile.3b"
39
+
40
+ echo "=================================================================="
41
+ echo " FRANKENSTALLM 3B Ollama 배포"
42
+ echo " 양자화 : $QUANT"
43
+ echo " GGUF : $GGUF_PATH"
44
+ echo " Modelfile: $MODELFILE"
45
+ echo "=================================================================="
46
+
47
+ # ---- Step 1: GGUF 변환 (필요 시) ----
48
+ if [[ "$SKIP_CONVERT" == "false" ]]; then
49
+ if [[ ! -f "$GGUF_PATH" ]]; then
50
+ echo ""
51
+ echo "[Step 1] GGUF 변환 실행 중 ..."
52
+ bash scripts/convert_3b_gguf.sh \
53
+ --input_dir checkpoints/korean_3b_orpo_v1/checkpoint-9840
54
+ else
55
+ echo "[Step 1] GGUF 파일 이미 존재 — 변환 건너뜀"
56
+ fi
57
+ else
58
+ echo "[Step 1] 변환 건너뜀 (--skip_convert)"
59
+ fi
60
+
61
+ if [[ ! -f "$GGUF_PATH" ]]; then
62
+ echo "ERROR: GGUF 파일 없음: $GGUF_PATH"
63
+ exit 1
64
+ fi
65
+
66
+ echo " GGUF 크기: $(du -sh "$GGUF_PATH" | cut -f1)"
67
+
68
+ # ---- Step 2: Ollama 설치 확인 ----
69
+ if ! command -v ollama &>/dev/null; then
70
+ echo ""
71
+ echo "[Step 2] Ollama 미설치 — 설치 중 ..."
72
+ curl -fsSL https://ollama.com/install.sh | sh
73
+ fi
74
+
75
+ # Ollama 서버 시작
76
+ if ! ollama list &>/dev/null 2>&1; then
77
+ echo "[Step 2] Ollama 서버 시작 중 ..."
78
+ ollama serve &>/tmp/ollama_serve.log &
79
+ for i in $(seq 1 15); do
80
+ if ollama list &>/dev/null 2>&1; then
81
+ echo " [OK] Ollama 서버 준비 (${i}초)"
82
+ break
83
+ fi
84
+ sleep 1
85
+ done
86
+ fi
87
+
88
+ # ---- Step 3: 모델 등록 ----
89
+ echo ""
90
+ echo "[Step 3] Ollama 모델 등록: $MODEL_NAME"
91
+ ollama create "$MODEL_NAME" -f "$MODELFILE"
92
+ echo " [OK] 등록 완료"
93
+
94
+ # ---- Step 4: 자동 테스트 ----
95
+ echo ""
96
+ echo "[Step 4] 자동 테스트 ..."
97
+ declare -a QUICK_TESTS=(
98
+ "대한민국의 수도는?"
99
+ "인공지능이란 무엇인가요?"
100
+ "한국의 전통 음식 중에서 김치에 대해 설명해주세요."
101
+ )
102
+
103
+ for prompt in "${QUICK_TESTS[@]}"; do
104
+ echo " Q: $prompt"
105
+ RESP=$(timeout 60 ollama run "$MODEL_NAME" "$prompt" 2>&1 || echo "[TIMEOUT/ERROR]")
106
+ echo " A: ${RESP:0:200}"
107
+ echo ""
108
+ done
109
+
110
+ # ---- Step 5: 반복률 검증 ----
111
+ echo "[Step 5] 반복률 검증 (15개 프롬프트) ..."
112
+ python3 scripts/test_ollama_repetition.py --model "$MODEL_NAME"
113
+
114
+ echo ""
115
+ echo "=================================================================="
116
+ echo " 배포 완료!"
117
+ echo " 사용법: ollama run $MODEL_NAME"
118
+ echo "=================================================================="
source/scripts/fix_tokenizer_byte_fallback.py ADDED
@@ -0,0 +1,235 @@
1
+ #!/usr/bin/env python3
2
+ """Fix GGUF newline crash by adding byte-fallback tokens to the tokenizer.
3
+
4
+ Problem: The SentencePiece Unigram tokenizer was trained without byte_fallback=True,
5
+ so characters like \n have no token representation. llama.cpp crashes when it
6
+ encounters these characters because there's no byte-fallback.
7
+
8
+ Fix:
9
+ 1. Add 256 byte-fallback tokens (<0x00> .. <0xFF>) to tokenizer.json
10
+ 2. Resize model embeddings from 64000 -> 64256
11
+ 3. Update config.json vocab_size
12
+ 4. Copy tokenizer.model for proper GGUF conversion
13
+
14
+ Usage:
15
+ python scripts/fix_tokenizer_byte_fallback.py \
16
+ --input outputs/hf_checkpoint-best \
17
+ --output outputs/hf_checkpoint-best-fixed \
18
+ --sp_model tokenizer/korean_sp/tokenizer.model
19
+ """
20
+
21
+ import argparse
22
+ import json
23
+ import shutil
24
+ from pathlib import Path
25
+
26
+ import torch
27
+ from safetensors.torch import load_file, save_file
28
+
29
+
30
+ BYTE_FALLBACK_COUNT = 256
31
+ BYTE_TOKEN_TEMPLATE = "<0x{:02X}>"
32
+
33
+
34
+ def fix_tokenizer_json(input_path: Path, output_path: Path):
35
+ """Add byte_fallback=True and 256 byte tokens to tokenizer.json."""
36
+ with open(input_path) as f:
37
+ tok = json.load(f)
38
+
39
+ model = tok["model"]
40
+ vocab = model["vocab"] # list of [piece, score]
41
+ original_size = len(vocab)
42
+
43
+ # Enable byte_fallback
44
+ model["byte_fallback"] = True
45
+
46
+ # Add 256 byte tokens with very low score (they're fallback only)
47
+ for i in range(BYTE_FALLBACK_COUNT):
48
+ byte_token = BYTE_TOKEN_TEMPLATE.format(i)
49
+ vocab.append([byte_token, 0.0])
50
+
51
+ new_size = len(vocab)
52
+ print(f" Vocab: {original_size} -> {new_size} (+{BYTE_FALLBACK_COUNT} byte tokens)")
53
+ print(f" byte_fallback: False -> True")
54
+
55
+ # Also add byte tokens to added_tokens list
56
+ added = tok.get("added_tokens", [])
57
+ for i in range(BYTE_FALLBACK_COUNT):
58
+ byte_token = BYTE_TOKEN_TEMPLATE.format(i)
59
+ added.append({
60
+ "id": original_size + i,
61
+ "content": byte_token,
62
+ "single_word": False,
63
+ "lstrip": False,
64
+ "rstrip": False,
65
+ "normalized": False,
66
+ "special": True,
67
+ })
68
+ tok["added_tokens"] = added
69
+
70
+ with open(output_path, "w") as f:
71
+ json.dump(tok, f, ensure_ascii=False, indent=2)
72
+
73
+ return original_size, new_size
74
+
75
+
76
+ def fix_config_json(input_path: Path, output_path: Path, new_vocab_size: int):
77
+ """Update vocab_size in config.json."""
78
+ with open(input_path) as f:
79
+ config = json.load(f)
80
+
81
+ old_size = config["vocab_size"]
82
+ config["vocab_size"] = new_vocab_size
83
+ print(f" config.json vocab_size: {old_size} -> {new_vocab_size}")
84
+
85
+ with open(output_path, "w") as f:
86
+ json.dump(config, f, indent=2)
87
+
88
+
89
+ def resize_embeddings(input_path: Path, output_path: Path,
90
+ old_vocab: int, new_vocab: int, tie_embeddings: bool):
91
+ """Resize embedding and lm_head weights to accommodate new tokens."""
92
+ print(f" Loading model weights from {input_path} ...")
93
+ state_dict = load_file(str(input_path))
94
+
95
+ embed_key = "model.embed_tokens.weight"
96
+ lm_head_key = "lm_head.weight"
97
+
98
+ if embed_key not in state_dict:
99
+ raise KeyError(f"{embed_key} not found in state_dict. Keys: {list(state_dict.keys())[:10]}")
100
+
101
+ embed = state_dict[embed_key]
102
+ print(f" embed_tokens shape: {embed.shape}")
103
+
104
+ hidden_size = embed.shape[1]
105
+ extra = new_vocab - old_vocab
106
+
107
+ # Initialize new embeddings as mean of existing (better than random for byte tokens)
108
+ mean_embed = embed.mean(dim=0, keepdim=True)
109
+ # Add small noise to avoid identical embeddings
110
+ noise = torch.randn(extra, hidden_size, dtype=embed.dtype) * 0.01
111
+ new_rows = mean_embed.expand(extra, -1) + noise
112
+
113
+ new_embed = torch.cat([embed, new_rows], dim=0)
114
+ state_dict[embed_key] = new_embed
115
+ print(f" embed_tokens resized: {embed.shape} -> {new_embed.shape}")
116
+
117
+ if tie_embeddings:
118
+ # When tie_word_embeddings=True, lm_head shares embed_tokens
119
+ # Remove lm_head if present (it will be tied automatically)
120
+ if lm_head_key in state_dict:
121
+ del state_dict[lm_head_key]
122
+ print(f" lm_head removed (tie_word_embeddings=True)")
123
+ else:
124
+ if lm_head_key in state_dict:
125
+ lm_head = state_dict[lm_head_key]
126
+ mean_lm = lm_head.mean(dim=0, keepdim=True)
127
+ noise_lm = torch.randn(extra, hidden_size, dtype=lm_head.dtype) * 0.01
128
+ new_lm = torch.cat([lm_head, mean_lm.expand(extra, -1) + noise_lm], dim=0)
129
+ state_dict[lm_head_key] = new_lm
130
+ print(f" lm_head resized: {lm_head.shape} -> {new_lm.shape}")
131
+
132
+ print(f" Saving to {output_path} ...")
133
+ save_file(state_dict, str(output_path))
134
+
135
+
136
+ def main():
137
+ parser = argparse.ArgumentParser(description="Fix tokenizer byte-fallback for GGUF")
138
+ parser.add_argument("--input", type=Path, required=True, help="Input HF checkpoint dir")
139
+ parser.add_argument("--output", type=Path, required=True, help="Output fixed HF checkpoint dir")
140
+ parser.add_argument("--sp_model", type=Path, default=None,
141
+ help="SentencePiece .model file to copy (for GGUF conversion)")
142
+ args = parser.parse_args()
143
+
144
+ input_dir = args.input
145
+ output_dir = args.output
146
+
147
+ if not input_dir.exists():
148
+ print(f"ERROR: Input directory not found: {input_dir}")
149
+ return 1
150
+
151
+ output_dir.mkdir(parents=True, exist_ok=True)
152
+
153
+ # Load config to check tie_word_embeddings
154
+ with open(input_dir / "config.json") as f:
155
+ config = json.load(f)
156
+ old_vocab = config["vocab_size"]
157
+ new_vocab = old_vocab + BYTE_FALLBACK_COUNT
158
+ tie_embeddings = config.get("tie_word_embeddings", False)
159
+
160
+ print(f"=== Byte-Fallback Fix ===")
161
+ print(f"Input: {input_dir}")
162
+ print(f"Output: {output_dir}")
163
+ print(f"Old vocab: {old_vocab}, New vocab: {new_vocab}")
164
+ print(f"tie_word_embeddings: {tie_embeddings}")
165
+ print()
166
+
167
+ # 1. Fix tokenizer.json
168
+ print("[1/4] Fixing tokenizer.json ...")
169
+ fix_tokenizer_json(
170
+ input_dir / "tokenizer.json",
171
+ output_dir / "tokenizer.json",
172
+ )
173
+
174
+ # 2. Fix config.json
175
+ print("[2/4] Fixing config.json ...")
176
+ fix_config_json(
177
+ input_dir / "config.json",
178
+ output_dir / "config.json",
179
+ new_vocab,
180
+ )
181
+
182
+ # 3. Resize model weights
183
+ print("[3/4] Resizing embeddings ...")
184
+ resize_embeddings(
185
+ input_dir / "model.safetensors",
186
+ output_dir / "model.safetensors",
187
+ old_vocab, new_vocab, tie_embeddings,
188
+ )
189
+
190
+ # 4. Copy other files
191
+ print("[4/4] Copying remaining files ...")
192
+ for fname in ["tokenizer_config.json", "generation_config.json"]:
193
+ src = input_dir / fname
194
+ if src.exists():
195
+ shutil.copy2(src, output_dir / fname)
196
+ print(f" Copied {fname}")
197
+
198
+ # Copy SentencePiece model if provided (needed for GGUF conversion)
199
+ if args.sp_model and args.sp_model.exists():
200
+ shutil.copy2(args.sp_model, output_dir / "tokenizer.model")
201
+ print(f" Copied tokenizer.model from {args.sp_model}")
202
+ elif (input_dir / "tokenizer.model").exists():
203
+ shutil.copy2(input_dir / "tokenizer.model", output_dir / "tokenizer.model")
204
+ print(f" Copied tokenizer.model from input dir")
205
+
206
+ # Update tokenizer_config.json to add added_tokens_decoder for byte tokens
207
+ tc_path = output_dir / "tokenizer_config.json"
208
+ if tc_path.exists():
209
+ with open(tc_path) as f:
210
+ tc = json.load(f)
211
+ added_tokens_decoder = tc.get("added_tokens_decoder", {})
212
+ for i in range(BYTE_FALLBACK_COUNT):
213
+ token_id = old_vocab + i
214
+ byte_token = BYTE_TOKEN_TEMPLATE.format(i)
215
+ added_tokens_decoder[str(token_id)] = {
216
+ "content": byte_token,
217
+ "lstrip": False,
218
+ "normalized": False,
219
+ "rstrip": False,
220
+ "single_word": False,
221
+ "special": True,
222
+ }
223
+ tc["added_tokens_decoder"] = added_tokens_decoder
224
+ with open(tc_path, "w") as f:
225
+ json.dump(tc, f, indent=2)
226
+ print(f" Updated tokenizer_config.json with {BYTE_FALLBACK_COUNT} byte tokens")
227
+
228
+ print()
229
+ print(f"=== Done! Fixed checkpoint at: {output_dir} ===")
230
+ print(f"Next: python outputs/llama.cpp/convert_hf_to_gguf.py {output_dir} --outfile outputs/gguf/frankenstallm-3b-f16.gguf --outtype f16")
231
+ return 0
232
+
233
+
234
+ if __name__ == "__main__":
235
+ raise SystemExit(main())
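A sketch of where this fixer sits in the GGUF pipeline, assuming the paths from the docstring and the script's closing "Next:" hint; the quantization step and the quantizer binary name are assumptions that depend on how llama.cpp was built.

```bash
#!/usr/bin/env bash
# Sketch: apply the byte-fallback fix, then convert and quantize the result.
# Paths follow the docstring above; verify them against the local checkout.
set -euo pipefail

python scripts/fix_tokenizer_byte_fallback.py \
  --input  outputs/hf_checkpoint-best \
  --output outputs/hf_checkpoint-best-fixed \
  --sp_model tokenizer/korean_sp/tokenizer.model

# f16 GGUF from the fixed checkpoint (per the script's final hint).
python outputs/llama.cpp/convert_hf_to_gguf.py outputs/hf_checkpoint-best-fixed \
  --outfile outputs/gguf/frankenstallm-3b-f16.gguf --outtype f16

# Quantize to Q4_K_M — the binary may be named llama-quantize or quantize
# depending on the llama.cpp build.
outputs/llama.cpp/build/bin/llama-quantize \
  outputs/gguf/frankenstallm-3b-f16.gguf \
  outputs/gguf/frankenstallm-3b-Q4_K_M.gguf Q4_K_M
```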
source/scripts/hourly_status.sh ADDED
@@ -0,0 +1,241 @@
1
+ #!/usr/bin/env bash
2
+ # =============================================================================
3
+ # hourly_status.sh — FRANKENSTALLM 3B Hourly Training Status Report (Telegram)
4
+ # Run: every hour via cron
5
+ # Sends a rich formatted message with progress, loss, ETA, GPU/disk summary.
6
+ # =============================================================================
7
+ set -euo pipefail
8
+
9
+ # ─── Paths ───────────────────────────────────────────────────────────────────
10
+ WORKDIR="${WORKDIR:-$(cd "$(dirname "$0")/.." && pwd)}"
11
+ CKPT_DIR="$WORKDIR/checkpoints/korean_3b_fp8_run1"
12
+ LOG_FILE="$CKPT_DIR/train.log"
13
+ PID_FILE="$CKPT_DIR/train.pid"
14
+ HOURLY_LOG="$CKPT_DIR/hourly_status.log"
15
+ NOTIFY="python3 $WORKDIR/scripts/telegram_notify.py"
16
+
17
+ TOTAL_STEPS=57000
18
+ TOTAL_TOKENS_B=114 # billion tokens target (57K steps × batch)
19
+
20
+ # ─── Helpers ─────────────────────────────────────────────────────────────────
21
+ ts() { date '+%Y-%m-%d %H:%M:%S'; }
22
+ log() { echo "[$(ts)] $*"; }
23
+
24
+ # Safely get last matching value from log
25
+ parse_last() {
26
+ local pattern="$1"
27
+ grep -oP "$pattern" "$LOG_FILE" 2>/dev/null | tail -1 || echo ""
28
+ }
29
+
30
+ # ─── Parse training log ───────────────────────────────────────────────────────
31
+ parse_log() {
32
+ if [[ ! -f "$LOG_FILE" ]]; then
33
+ echo "NO_LOG"
34
+ return 1
35
+ fi
36
+
37
+ # Get the last step line
38
+ LAST_LINE=$(grep -E 'step\s+[0-9]+.*loss' "$LOG_FILE" 2>/dev/null | tail -1 || echo "")
39
+ if [[ -z "$LAST_LINE" ]]; then
40
+ echo "NO_STEPS"
41
+ return 1
42
+ fi
43
+
44
+ CURRENT_STEP=$(echo "$LAST_LINE" | grep -oP 'step\s+\K[0-9]+' || echo "0")
45
+ CURRENT_LOSS=$(echo "$LAST_LINE" | grep -oP 'loss\s+\K[0-9.]+' || echo "N/A")
46
+ CURRENT_LR=$(echo "$LAST_LINE" | grep -oP 'lr\s+\K[0-9.e+-]+' || echo "N/A")
47
+ CURRENT_GNORM=$(echo "$LAST_LINE" | grep -oP 'gnorm\s+\K[0-9.]+' || echo "N/A")
48
+ CURRENT_TOKPS=$(echo "$LAST_LINE" | grep -oP 'tok/s\s+\K[\d,]+' | tr -d ',' || echo "0")
49
+ CURRENT_MEM=$(echo "$LAST_LINE" | grep -oP 'mem\s+\K[0-9.]+GB' || echo "N/A")
50
+ CURRENT_EPOCH=$(echo "$LAST_LINE" | grep -oP 'epoch\s+\K[0-9]+' || echo "0")
51
+
52
+ # Log timestamp — parse from the line itself
53
+ LOG_TS=$(echo "$LAST_LINE" | grep -oP '\[\K[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}' || echo "unknown")
54
+
55
+ return 0
56
+ }
57
+
58
+ # ─── Calculate progress & ETA ─────────────────────────────────────────────────
59
+ compute_eta() {
60
+ local step="$1"
61
+ local tokps="$2"
62
+
63
+ # Progress
64
+ PROGRESS_PCT=$(echo "scale=1; $step * 100 / $TOTAL_STEPS" | bc -l 2>/dev/null || echo "0")
65
+
66
+ # Steps remaining
67
+ STEPS_LEFT=$(( TOTAL_STEPS - step ))
68
+
69
+ # Tokens processed so far (approx: step × ~1M tokens/step for the 3B run)
70
+ # bs=4, accum=8, 8gpu → effective batch = 4*8*8=256 sequences × 4096 tokens = 1,048,576 ≈ 1M tok/step
71
+ TOKENS_PROCESSED_B=$(echo "scale=2; $step * 1048576 / 1000000000" | bc -l 2>/dev/null || echo "0")
72
+
73
+ # ETA using current tok/s
74
+ if [[ "$tokps" -gt 0 ]]; then
75
+ # tokens remaining
76
+ local tokens_left_b
77
+ tokens_left_b=$(echo "scale=2; ($TOTAL_STEPS - $step) * 1048576 / 1000000000" | bc -l 2>/dev/null || echo "0")
78
+ local tokens_left
79
+ tokens_left=$(echo "scale=0; ($TOTAL_STEPS - $step) * 1048576" | bc -l 2>/dev/null || echo "0")
80
+ local secs_left
81
+ secs_left=$(echo "scale=0; $tokens_left / $tokps" | bc -l 2>/dev/null || echo "0")
82
+
83
+ ETA_HOURS=$(echo "scale=1; $secs_left / 3600" | bc -l 2>/dev/null || echo "N/A")
84
+ if [[ "$ETA_HOURS" != "N/A" ]]; then
85
+ local eta_epoch
86
+ eta_epoch=$(( $(date +%s) + secs_left ))
87
+ ETA_DATETIME=$(date -d "@$eta_epoch" '+%m/%d %H:%M' 2>/dev/null || echo "N/A")
88
+ else
89
+ ETA_DATETIME="N/A"
90
+ fi
91
+ else
92
+ ETA_HOURS="N/A"
93
+ ETA_DATETIME="N/A"
94
+ fi
95
+ }
96
+
97
+ # ─── GPU summary ─────────────────────────────────────────────────────────────
98
+ get_gpu_summary() {
99
+ if ! command -v nvidia-smi &>/dev/null; then
100
+ GPU_SUMMARY="nvidia-smi not available"
101
+ GPU_AVG_UTIL="N/A"
102
+ GPU_TOTAL_MEM="N/A"
103
+ return
104
+ fi
105
+
106
+ local raw
107
+ raw=$(nvidia-smi --query-gpu=index,utilization.gpu,memory.used,memory.total \
108
+ --format=csv,noheader,nounits 2>/dev/null || echo "")
109
+
110
+ if [[ -z "$raw" ]]; then
111
+ GPU_SUMMARY="GPU query failed"
112
+ GPU_AVG_UTIL="N/A"
113
+ GPU_TOTAL_MEM="N/A"
114
+ return
115
+ fi
116
+
117
+ # avg util
118
+ GPU_AVG_UTIL=$(echo "$raw" | awk -F', ' '{sum+=$2; count++} END {printf "%.0f%%", sum/count}')
119
+
120
+ # total mem used / total
121
+ GPU_TOTAL_MEM=$(echo "$raw" | awk -F', ' \
122
+ '{used+=$3; total+=$4} END {printf "%.1f / %.1f GiB", used/1024, total/1024}')
123
+
124
+ # Per-GPU one-liner: "G0:95% 48G | G1:94% 48G | ..."
125
+ GPU_SUMMARY=$(echo "$raw" | awk -F', ' \
126
+ '{printf "G%s:%s%% %sMiB | ", $1, $2, $3}' | sed 's/ | $//')
127
+ }
128
+
129
+ # ─── Disk usage ──────────────────────────────────────────────────────────────
130
+ get_disk_info() {
131
+ DISK_INFO=$(df -h "$CKPT_DIR" 2>/dev/null | awk 'NR==2 {printf "%s used / %s total (%s)", $3, $2, $5}' || echo "N/A")
132
+ CKPT_COUNT=$(ls -d "$CKPT_DIR"/checkpoint-* 2>/dev/null | wc -l || echo "0")
133
+ LAST_CKPT=$(ls -dt "$CKPT_DIR"/checkpoint-* 2>/dev/null | head -1 | xargs basename 2>/dev/null || echo "none")
134
+ }
135
+
136
+ # ─── Process status ───────────────────────────────────────────────────────────
137
+ get_process_status() {
138
+ PROC_STATUS="UNKNOWN"
139
+ if [[ -f "$PID_FILE" ]]; then
140
+ local pid
141
+ pid=$(cat "$PID_FILE" 2>/dev/null | tr -d '[:space:]')
142
+ if [[ -n "$pid" ]] && kill -0 "$pid" 2>/dev/null; then
143
+ PROC_STATUS="RUNNING (PID $pid)"
144
+ else
145
+ PROC_STATUS="STOPPED (PID $pid)"
146
+ fi
147
+ else
148
+ PROC_STATUS="NO PID FILE"
149
+ fi
150
+ }
151
+
152
+ # ─── Build & send message ────────────────────────────────────────────────────
153
+ build_and_send() {
154
+ local step="$CURRENT_STEP"
155
+ local loss="$CURRENT_LOSS"
156
+ local tokps="$CURRENT_TOKPS"
157
+
158
+ # Status icon
159
+ local status_icon
160
+ if [[ "$PROC_STATUS" == RUNNING* ]]; then
161
+ status_icon="&#9989;" # green check
162
+ else
163
+ status_icon="&#10060;" # red X
164
+ fi
165
+
166
+ # Progress bar (20 chars)
167
+ local bar_filled=$(echo "scale=0; $PROGRESS_PCT * 20 / 100" | bc -l 2>/dev/null || echo "0")
168
+ local bar_empty=$(( 20 - bar_filled ))
169
+ PROGRESS_BAR=$(printf '%0.s&#9608;' $(seq 1 $bar_filled 2>/dev/null) ; printf '%0.s&#9617;' $(seq 1 $bar_empty 2>/dev/null)) || PROGRESS_BAR="[$PROGRESS_PCT%]"
170
+
171
+ local msg
172
+ msg="$(cat <<EOF
173
+ <b>FRANKENSTALLM 3B — Hourly Status</b>
174
+ <i>$(ts)</i>
175
+
176
+ $status_icon <b>Process:</b> $PROC_STATUS
177
+
178
+ <b>Progress</b>
179
+ Step: <code>$step / $TOTAL_STEPS</code> ($PROGRESS_PCT%)
180
+ Tokens: <code>${TOKENS_PROCESSED_B}B / ${TOTAL_TOKENS_B}B</code>
181
+ Epoch: <code>$CURRENT_EPOCH</code>
182
+ Last log: <code>$LOG_TS</code>
183
+
184
+ <b>Training Metrics</b>
185
+ Loss: <code>$loss</code>
186
+ LR: <code>$CURRENT_LR</code>
187
+ Gnorm: <code>$CURRENT_GNORM</code>
188
+ Tok/s: <code>$tokps</code>
189
+ Mem: <code>$CURRENT_MEM</code>
190
+
191
+ <b>ETA</b>
192
+ Steps left: <code>$STEPS_LEFT</code>
193
+ Remaining: <code>~$ETA_HOURS h</code>
194
+ Est. done: <code>$ETA_DATETIME</code>
195
+
196
+ <b>GPU</b>
197
+ Avg util: <code>$GPU_AVG_UTIL</code>
198
+ Total mem: <code>$GPU_TOTAL_MEM</code>
199
+
200
+ <b>Checkpoints</b>
201
+ Last saved: <code>$LAST_CKPT</code>
202
+ Total: <code>$CKPT_COUNT</code> checkpoints
203
+
204
+ <b>Disk</b>
205
+ <code>$DISK_INFO</code>
206
+ EOF
207
+ )"
208
+
209
+ log "Sending hourly status report (step $step)..."
210
+ $NOTIFY "$msg" || {
211
+ log "ERROR: Failed to send Telegram message."
212
+ return 1
213
+ }
214
+ log "Status report sent."
215
+ }
216
+
217
+ # ─── Main ────────────────────────────────────────────────────────────────────
218
+ main() {
219
+ log "=== Hourly status START ==="
220
+
221
+ parse_log || {
222
+ log "Cannot parse log — sending minimal status."
223
+ $NOTIFY "<b>FRANKENSTALLM 3B</b> — Status check at $(ts)
224
+
225
+ <b>WARNING:</b> Cannot read training log at:
226
+ <code>$LOG_FILE</code>
227
+
228
+ Process status: $(cat "$PID_FILE" 2>/dev/null && echo "(PID found)" || echo "(no PID file)")" || true
229
+ return 0
230
+ }
231
+
232
+ compute_eta "$CURRENT_STEP" "$CURRENT_TOKPS"
233
+ get_gpu_summary
234
+ get_disk_info
235
+ get_process_status
236
+ build_and_send
237
+
238
+ log "=== Hourly status END ==="
239
+ }
240
+
241
+ main "$@"
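The header notes that this report is meant to run hourly from cron. One way to install it is sketched below; the project path is taken from launch_hybrid_3b.sh and the cron-side log file name is an assumption.

```bash
# Hypothetical crontab entry — edit with `crontab -e`. Runs the report at the
# top of every hour and appends cron-side output to a separate log file.
0 * * * * cd /PROJECT/0325120031_A/ghong/taketimes/llm-bang && bash scripts/hourly_status.sh >> checkpoints/korean_3b_fp8_run1/hourly_cron.log 2>&1
```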
source/scripts/launch_3b_orpo.sh ADDED
@@ -0,0 +1,177 @@
1
+ #!/usr/bin/env bash
2
+ # =============================================================================
3
+ # launch_3b_orpo.sh — 8-GPU ORPO fine-tuning launcher for Korean 3B LLM
4
+ #
5
+ # Usage:
6
+ # bash scripts/launch_3b_orpo.sh # 기본 실행
7
+ # bash scripts/launch_3b_orpo.sh --max_steps 200 # 빠른 테스트
8
+ # RUN_NAME=my_orpo bash scripts/launch_3b_orpo.sh # 이름 지정
9
+ #
10
+ # 기반 모델 : eval/outputs/hf_3b_sft_best (SFT v1 best)
11
+ # 데이터 : data/preference/combined_preference.jsonl
12
+ # 출력 : checkpoints/korean_3b_orpo_v1/
13
+ # 로그 : checkpoints/korean_3b_orpo_v1/train.log
14
+ #
15
+ # 체크포인트 크기 예상:
16
+ # model weights: ~6GB (bf16)
17
+ # optimizer states: ~24GB
18
+ # 총 ~30GB/개 × max 5개 = 150GB
19
+ # =============================================================================
20
+ set -euo pipefail
21
+
22
+ # ---- Configurable defaults --------------------------------------------------
23
+ RUN_NAME="${RUN_NAME:-korean_3b_orpo_v1}"
24
+ BASE_MODEL="${BASE_MODEL:-eval/outputs/hf_3b_sft_best}"
25
+ DATA_PATH="${DATA_PATH:-data/preference/combined_preference.jsonl}"
26
+ OUTPUT_DIR="checkpoints/${RUN_NAME}"
27
+ CKPT_DIR="checkpoints/${RUN_NAME}"
28
+ LOG_FILE="${CKPT_DIR}/train.log"
29
+ NPROC=8
30
+ MASTER_PORT="${MASTER_PORT:-29502}"
31
+
32
+ # ORPO 하이퍼파라미터
33
+ BATCH_SIZE=4
34
+ GRAD_ACCUM=4
35
+ LR=1.2e-5
36
+ BETA=0.25
37
+ EPOCHS=2
38
+ MAX_LENGTH=1536
39
+ WARMUP_RATIO=0.05
40
+ WEIGHT_DECAY=0.01
41
+ EVAL_SPLIT_RATIO=0.05
42
+ EVAL_STEPS=500
43
+ EARLY_STOPPING_PATIENCE=3
44
+ SAVE_TOTAL_LIMIT=5
45
+ SEED=42
46
+
47
+ EXTRA_ARGS="$@"
48
+
49
+ # ---- B200 / NVSwitch single-node NCCL tuning --------------------------------
50
+ # (launch_3b_pretrain.sh와 동일한 NCCL 설정 유지)
51
+ export NCCL_IB_DISABLE=1
52
+ export NCCL_PROTO=Simple
53
+ export NCCL_MIN_NCHANNELS=16
54
+ export NCCL_MAX_NCHANNELS=16
55
+ # ORPO forward-backward 패스는 pretrain보다 메모리 변동이 크므로 버퍼 128MB 유지
56
+ export NCCL_BUFFSIZE=134217728
57
+ export OMP_NUM_THREADS=9
58
+ export MKL_NUM_THREADS=9
59
+ # OOM 방지: 메모리 단편화 완화 (ORPO는 chosen/rejected 동시 forward → 메모리 민감)
60
+ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
61
+ # P2P NVLink 직접 통신 활성화
62
+ export NCCL_P2P_LEVEL=NVL
63
+ # Ring + Tree 병행 (3B gradient 크기 기준)
64
+ export NCCL_ALGO=Ring,Tree
65
+
66
+ export PYTHONWARNINGS="ignore::UserWarning:torch.library"
67
+
68
+ cd "$(dirname "$0")/.."
69
+
70
+ # ---- Pre-flight checks ------------------------------------------------------
71
+ if [[ ! -d "${BASE_MODEL}" ]]; then
72
+ echo "ERROR: 기반 모델 디렉토리 없음: ${BASE_MODEL}"
73
+ echo " SFT 완료 후 HF 포맷으로 변환했는지 확인하세요."
74
+ echo " 예: python scripts/convert_to_hf.py --checkpoint <sft_ckpt> --output ${BASE_MODEL}"
75
+ exit 1
76
+ fi
77
+
78
+ if [[ ! -f "${DATA_PATH}" ]]; then
79
+ echo "ERROR: 학습 데이터 없음: ${DATA_PATH}"
80
+ echo " 먼저 데이터 통합 스크립트를 실행하세요:"
81
+ echo " python data/prepare_preference_combined.py"
82
+ exit 1
83
+ fi
84
+
85
+ if [[ ! -f "train/orpo.py" ]]; then
86
+ echo "ERROR: train/orpo.py 없음"
87
+ exit 1
88
+ fi
89
+
90
+ # GPU 메모리 체크
91
+ GPU_MEM=$(nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits 2>/dev/null | head -1 || echo "0")
92
+ if [[ "$GPU_MEM" -gt 0 && "$GPU_MEM" -lt 40000 ]]; then
93
+ echo "WARNING: GPU 메모리 ${GPU_MEM}MB < 40GB. ORPO 3B 학습에 부족할 수 있음."
94
+ fi
95
+
96
+ # 중복 프로세스 방지
97
+ EXISTING_PID=$(pgrep -f "orpo.py.*${RUN_NAME}" 2>/dev/null | head -1 || true)
98
+ if [[ -n "$EXISTING_PID" ]]; then
99
+ echo "ERROR: 이미 ORPO 프로세스 실행 중 (PID: ${EXISTING_PID})"
100
+ echo " kill ${EXISTING_PID} 로 먼저 종료하세요."
101
+ exit 1
102
+ fi
103
+
104
+ # 디스크 여유 확인 (최소 200GB)
105
+ AVAIL_KB=$(df /PROJECT 2>/dev/null | awk 'NR==2{print $4}' || echo "0")
106
+ if [[ -n "$AVAIL_KB" && "$AVAIL_KB" -gt 0 && "$AVAIL_KB" -lt 209715200 ]]; then
107
+ AVAIL_GB=$(echo "scale=1; $AVAIL_KB / 1048576" | bc 2>/dev/null || echo "?")
108
+ echo "WARNING: /PROJECT 여유 ${AVAIL_GB}GB < 200GB. 체크포인트 저장 공간 부족 가능."
109
+ fi
110
+
111
+ mkdir -p "${CKPT_DIR}" "${OUTPUT_DIR}"
112
+
113
+ # ---- 데이터 레코드 수 확인 --------------------------------------------------
114
+ DATA_LINES=$(wc -l < "${DATA_PATH}" 2>/dev/null || echo "?")
115
+ echo " 학습 데이터 레코드 수: ${DATA_LINES}"
116
+
117
+ # ---- 유효 배치 크기 계산 ----------------------------------------------------
118
+ EFF_BATCH=$((BATCH_SIZE * NPROC * GRAD_ACCUM))
119
+
120
+ echo "=================================================================="
121
+ echo " Korean 3B LLM ORPO Fine-Tuning"
122
+ echo " Run name : ${RUN_NAME}"
123
+ echo " Base model : ${BASE_MODEL}"
124
+ echo " Data : ${DATA_PATH} (${DATA_LINES} records)"
125
+ echo " Output dir : ${OUTPUT_DIR}"
126
+ echo " CKPT dir : ${CKPT_DIR}"
127
+ echo " Log file : ${LOG_FILE}"
128
+ echo " Epochs : ${EPOCHS}"
129
+ echo " LR : ${LR}"
130
+ echo " Beta (ORPO) : ${BETA}"
131
+ echo " Batch : ${BATCH_SIZE} (local) × ${NPROC} GPU × ${GRAD_ACCUM} accum = ${EFF_BATCH}"
132
+ echo " Max length : ${MAX_LENGTH}"
133
+ echo " Weight decay : ${WEIGHT_DECAY}"
134
+ echo " Eval steps : ${EVAL_STEPS}"
135
+ echo " Early stop : patience=${EARLY_STOPPING_PATIENCE}"
136
+ echo " Started : $(date)"
137
+ echo "=================================================================="
138
+
139
+ torchrun \
140
+ --nproc_per_node=${NPROC} \
141
+ --master_port=${MASTER_PORT} \
142
+ train/orpo.py \
143
+ --model_path "${BASE_MODEL}" \
144
+ --custom_data_path "${DATA_PATH}" \
145
+ --output_dir "${OUTPUT_DIR}" \
146
+ --epochs ${EPOCHS} \
147
+ --lr ${LR} \
148
+ --beta ${BETA} \
149
+ --batch_size ${BATCH_SIZE} \
150
+ --gradient_accumulation_steps ${GRAD_ACCUM} \
151
+ --max_length ${MAX_LENGTH} \
152
+ --weight_decay ${WEIGHT_DECAY} \
153
+ --eval_split_ratio ${EVAL_SPLIT_RATIO} \
154
+ --eval_steps ${EVAL_STEPS} \
155
+ --early_stopping_patience ${EARLY_STOPPING_PATIENCE} \
156
+ --save_total_limit ${SAVE_TOTAL_LIMIT} \
157
+ ${EXTRA_ARGS} \
158
+ 2>&1 | tee "${LOG_FILE}" \
159
+ | grep -v "UserWarning" \
160
+ | grep -v "Warning only once" \
161
+ | grep -v "Overriding a previously" \
162
+ | grep -v "dispatch key:" \
163
+ | grep -v "previous kernel:" \
164
+ | grep -v "new kernel:" \
165
+ | grep -v "operator: flash_attn" \
166
+ | grep -v "registered at /usr/local" \
167
+ | grep -v "self.m.impl"
168
+
169
+ EXIT_CODE=$?
170
+ echo "=================================================================="
171
+ echo " Done : $(date)"
172
+ echo " Exit code: ${EXIT_CODE}"
173
+ if [[ "${EXIT_CODE}" -eq 0 ]]; then
174
+ echo " 모델 저장 위치: ${OUTPUT_DIR}"
175
+ fi
176
+ echo "=================================================================="
177
+ exit $EXIT_CODE
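With the defaults above, one optimizer step consumes 4 × 8 × 4 = 128 preference pairs, so the total step count follows directly from the record count printed as DATA_LINES. A small sketch of that arithmetic (the default record count is a placeholder):

```bash
#!/usr/bin/env bash
# Sketch: estimate ORPO optimizer steps from the defaults in launch_3b_orpo.sh.
# Pass the real `wc -l` of combined_preference.jsonl as $1; 300000 is a placeholder.
set -euo pipefail

BATCH_SIZE=4; NPROC=8; GRAD_ACCUM=4; EPOCHS=2
DATA_LINES="${1:-300000}"

EFF_BATCH=$((BATCH_SIZE * NPROC * GRAD_ACCUM))                 # 128 pairs/step
STEPS_PER_EPOCH=$(( (DATA_LINES + EFF_BATCH - 1) / EFF_BATCH ))
TOTAL_STEPS=$((STEPS_PER_EPOCH * EPOCHS))

echo "effective batch : ${EFF_BATCH}"
echo "steps per epoch : ${STEPS_PER_EPOCH}"
echo "total steps     : ${TOTAL_STEPS} (before early stopping)"
```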
source/scripts/launch_3b_pretrain.sh ADDED
@@ -0,0 +1,258 @@
1
+ #!/usr/bin/env bash
2
+ # =============================================================================
3
+ # launch_3b_pretrain.sh — 8-GPU FP8 pretraining launcher for Korean 3B LLM
4
+ #
5
+ # Features:
6
+ # - SIGHUP 방어: SSH 끊김 시 자동으로 nohup+setsid로 세션 보호
7
+ # - Graceful shutdown: SIGTERM 시 Python 시그널 핸들러가 비상 체크포인트 저장
8
+ # - 자동 resume: 최신 체크포인트에서 자동 재개
9
+ # - PID 파일: 프로세스 모니터링 및 제어용
10
+ # - grep 파이프라인 exit code 보호 (|| true)
11
+ #
12
+ # Usage:
13
+ # bash scripts/launch_3b_pretrain.sh # full run (60B tokens)
14
+ # bash scripts/launch_3b_pretrain.sh --max_steps 500 # quick test
15
+ # bash scripts/launch_3b_pretrain.sh --resume checkpoints/korean_3b_fp8_run1/checkpoint-0010000
16
+ # MAX_STEPS=95000 bash scripts/launch_3b_pretrain.sh # 100B tokens
17
+ #
18
+ # 모니터링:
19
+ # tail -f checkpoints/korean_3b_fp8_run1/train.log
20
+ # cat checkpoints/korean_3b_fp8_run1/train.pid
21
+ #
22
+ # 중지 (비상 체크포인트 자동 저장):
23
+ # kill $(cat checkpoints/korean_3b_fp8_run1/train.pid)
24
+ #
25
+ # 강제 종료 (체크포인트 저장 없음):
26
+ # kill -9 $(cat checkpoints/korean_3b_fp8_run1/train.pid)
27
+ # =============================================================================
28
+
29
+ # -u: error on undefined variables
30
+ # NOTE: -e and -o pipefail are intentionally omitted.
31
+ # Previous problem: when the grep pipeline filtered out every line it returned exit code 1
32
+ # → pipefail propagated that as a script failure → training aborted
33
+ # Fix: drop set -e/pipefail and add || true to the grep chain
34
+ set -u
35
+
36
+ # ---- Configurable defaults --------------------------------------------------
37
+ RUN_NAME="${RUN_NAME:-korean_3b_fp8_run1}"
38
+ CONFIG="${CONFIG:-configs/korean_3b_fp8.yaml}"
39
+ TRAIN_DATA="${TRAIN_DATA:-data/3b_train.bin}"
40
+ VAL_DATA="${VAL_DATA:-data/3b_val.bin}"
41
+ CKPT_DIR="checkpoints/${RUN_NAME}"
42
+ LOG_FILE="${CKPT_DIR}/train.log"
43
+ NPROC=8
44
+ MASTER_PORT="${MASTER_PORT:-29501}"
45
+
46
+ MAX_STEPS="${MAX_STEPS:-57000}"
47
+ BATCH_SIZE=5
48
+ GRAD_ACCUM=8
49
+ WARMUP_STEPS=2000
50
+ SEED=42
51
+
52
+ # ---- B200 / NVSwitch single-node NCCL tuning (3B optimized, v2) ----------
53
+ export NCCL_IB_DISABLE=1
54
+ export NCCL_ALGO=NVLS,Ring # NVSwitch hardware reduction first (was Ring,Tree)
55
+ export NCCL_PROTO=Simple
56
+ export NCCL_NVLS_ENABLE=1 # NVLink SHARP — hardware-accelerated all-reduce
57
+ export NCCL_MIN_NCHANNELS=32 # raise minimum for NVSwitch headroom (was 16)
58
+ export NCCL_MAX_NCHANNELS=32
59
+ export NCCL_BUFFSIZE=268435456 # 256MB (was 128MB) — reduces bucket pipeline stalls
60
+ export NCCL_P2P_LEVEL=NVL
61
+ export NCCL_NET_GDR_LEVEL=0
62
+ export OMP_NUM_THREADS=4
63
+ export MKL_NUM_THREADS=4
64
+ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
65
+ # Triton/Inductor cache on executable filesystem (not /tmp which is noexec)
66
+ export TRITON_CUDACRT_PATH=/usr/local/cuda/include
67
+ export TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas
68
+
69
+ cd "$(dirname "$0")/.."
70
+
71
+ mkdir -p "${CKPT_DIR}"
72
+
73
+ # ---- Session protection (SIGHUP defence) -------------------------------------
74
+ # When run outside tmux/screen, the script automatically re-wraps itself in
75
+ # nohup + setsid so the training process survives SSH disconnects (SIGHUP).
76
+ #
77
+ # How it works:
78
+ # 1. Check for tmux/screen or an already-protected relaunch
79
+ # 2. If unprotected, set _LAUNCH_PROTECTED=1 and re-exec this script under nohup setsid
80
+ # 3. The re-executed process becomes a new session leader, detached from the terminal
81
+ # 4. The original shell prints the PID and monitoring commands, then exits immediately
82
+ PID_FILE="${CKPT_DIR}/train.pid"
83
+
84
+ if [[ -z "${_LAUNCH_PROTECTED:-}" ]] && [[ -z "${TMUX:-}" ]] && [[ -z "${STY:-}" ]]; then
85
+ export _LAUNCH_PROTECTED=1
86
+ NOHUP_LOG="${CKPT_DIR}/launch_$(date +%Y%m%d_%H%M%S).log"
87
+
88
+ echo "=================================================================="
89
+ echo " SIGHUP PROTECTION ACTIVATED"
90
+ echo " tmux/screen 미감지 → 세션 보호 모드 자동 활성화 (nohup + setsid)"
91
+ echo " SSH 끊어져도 학습이 계속됩니다."
92
+ echo "=================================================================="
93
+ echo ""
94
+
95
+ # 자기 자신을 세션 보호 모드로 재실행
96
+ nohup setsid bash "$0" "$@" > "${NOHUP_LOG}" 2>&1 &
97
+ BG_PID=$!
98
+ echo "${BG_PID}" > "${PID_FILE}"
99
+
100
+ echo " PID : ${BG_PID}"
101
+ echo " PID 파일 : ${PID_FILE}"
102
+ echo " Launch 로그 : ${NOHUP_LOG}"
103
+ echo " 학습 로그 : ${LOG_FILE}"
104
+ echo ""
105
+ echo " 모니터링:"
106
+ echo " tail -f ${LOG_FILE}"
107
+ echo ""
108
+ echo " 중지 (비상 체크포인트 자동 저장):"
109
+ echo " kill \$(cat ${PID_FILE})"
110
+ echo ""
111
+ echo " 강제 종료:"
112
+ echo " kill -9 \$(cat ${PID_FILE})"
113
+ echo "=================================================================="
114
+ exit 0
115
+ fi
116
+
117
+ # ---- Cleanup on exit --------------------------------------------------------
118
+ PREWARM_PID=""
119
+
120
+ cleanup() {
121
+ rm -f "${PID_FILE}" 2>/dev/null || true
122
+ if [[ -n "${PREWARM_PID:-}" ]]; then
123
+ kill "${PREWARM_PID}" 2>/dev/null || true
124
+ fi
125
+ }
126
+ trap cleanup EXIT
127
+
128
+ # PID 파일 기록 (tmux/screen 내에서 실행 시에도 PID 추적 가능)
129
+ echo "$$" > "${PID_FILE}"
130
+
131
+ # ---- Pre-flight checks ------------------------------------------------------
132
+ if [[ ! -f "${CONFIG}" ]]; then
133
+ echo "[ERROR] Config not found: ${CONFIG}"
134
+ exit 1
135
+ fi
136
+
137
+ if [[ ! -f "${TRAIN_DATA}" ]]; then
138
+ echo "[ERROR] Training data not found: ${TRAIN_DATA}"
139
+ exit 1
140
+ fi
141
+
142
+ # GPU 메모리 체크 (3B는 최소 80GB/GPU 권장, B200=192GB → OK)
143
+ GPU_MEM=$(nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits 2>/dev/null | head -1 || echo "0")
144
+ if [[ "$GPU_MEM" -gt 0 && "$GPU_MEM" -lt 80000 ]]; then
145
+ echo "[WARN] GPU memory ${GPU_MEM}MB < 80GB. 3B 학습에 부족할 수 있음."
146
+ fi
147
+
148
+ # 중복 프로세스 방지
149
+ EXISTING_PID=$(pgrep -f "pretrain.py.*korean_3b" 2>/dev/null | head -1 || true)
150
+ if [[ -n "$EXISTING_PID" ]]; then
151
+ echo "[ERROR] 이미 3B pretrain 프로세스 실행 중 (PID: ${EXISTING_PID})"
152
+ echo " kill ${EXISTING_PID} 로 먼저 종료하세요."
153
+ exit 1
154
+ fi
155
+
156
+ # 디스크 여유 확인 (최소 1TB 필요)
157
+ AVAIL_KB=$(df /PROJECT 2>/dev/null | awk 'NR==2{print $4}')
158
+ if [[ -n "${AVAIL_KB:-}" ]] && [[ "$AVAIL_KB" -lt 1073741824 ]]; then
159
+ AVAIL_TB=$(echo "scale=1; $AVAIL_KB / 1073741824" | bc 2>/dev/null || echo "?")
160
+ echo "[WARN] /PROJECT 여유 ${AVAIL_TB}TB < 1TB. 체크포인트 저장 공간 부족 가능."
161
+ fi
162
+
163
+ # ---- Resume detection -------------------------------------------------------
164
+ RESUME_ARG=""
165
+ EXTRA_ARGS="${*:-}"
166
+ if [[ ! "${EXTRA_ARGS}" =~ "--resume" ]]; then
167
+ # 가장 최근 체크포인트 자동 감지
168
+ LATEST_CKPT=$(ls -d "${CKPT_DIR}"/checkpoint-* 2>/dev/null | sort -V | tail -1 || true)
169
+ if [[ -n "$LATEST_CKPT" ]]; then
170
+ echo "[INFO] 자동 resume 감지: ${LATEST_CKPT}"
171
+ RESUME_ARG="--resume ${LATEST_CKPT}"
172
+ fi
173
+ fi
174
+
175
+ # ---- Banner ------------------------------------------------------------------
176
+ SESSION_TYPE="direct"
177
+ [[ -n "${TMUX:-}" ]] && SESSION_TYPE="tmux"
178
+ [[ -n "${STY:-}" ]] && SESSION_TYPE="screen"
179
+ [[ -n "${_LAUNCH_PROTECTED:-}" ]] && SESSION_TYPE="protected (nohup+setsid)"
180
+
181
+ echo "=================================================================="
182
+ echo " Korean 3B LLM Pre-Training (FP8)"
183
+ echo " Run name : ${RUN_NAME}"
184
+ echo " Config : ${CONFIG}"
185
+ echo " CKPT dir : ${CKPT_DIR}"
186
+ echo " Log file : ${LOG_FILE}"
187
+ echo " Max steps : ${MAX_STEPS}"
188
+ echo " Batch : ${BATCH_SIZE} (local) x ${NPROC} GPU x ${GRAD_ACCUM} accum"
189
+ echo " Eff tokens : $((BATCH_SIZE * NPROC * GRAD_ACCUM * 4096)) tokens/step (~1M)"
190
+ echo " Total tokens: ~$((MAX_STEPS * BATCH_SIZE * NPROC * GRAD_ACCUM * 4096 / 1000000000))B"
191
+ echo " Resume : ${RESUME_ARG:-none (fresh start)}"
192
+ echo " Session : ${SESSION_TYPE}"
193
+ echo " PID : $$ (file: ${PID_FILE})"
194
+ echo " Started : $(date)"
195
+ echo "=================================================================="
196
+
197
+ export PYTHONWARNINGS="ignore::UserWarning:torch.library"
198
+
199
+ # ---- Pre-warm OS page cache (NUMA-interleaved, non-blocking) ---------------
200
+ if [[ -f "${TRAIN_DATA}" ]]; then
201
+ echo "[INFO] Pre-warming page cache for ${TRAIN_DATA} (NUMA interleaved)..."
202
+ numactl --interleave=all dd if="${TRAIN_DATA}" of=/dev/null bs=16M 2>/dev/null &
203
+ PREWARM_PID=$!
204
+ fi
205
+
206
+ # ---- Launch training ---------------------------------------------------------
207
+ # grep pipeline protection:
208
+ # Problem: grep -v returns exit code 1 when no lines match
209
+ # Fix: wrapping in { ... || true; } guarantees the filter stage always exits 0
210
+ # torchrun's real exit code is captured separately via PIPESTATUS[0]
211
+ numactl --interleave=all \
212
+ torchrun \
213
+ --nproc_per_node=${NPROC} \
214
+ --master_port=${MASTER_PORT} \
215
+ train/pretrain.py \
216
+ --config "${CONFIG}" \
217
+ --train_data "${TRAIN_DATA}" \
218
+ --val_data "${VAL_DATA}" \
219
+ --checkpoint_dir "${CKPT_DIR}" \
220
+ --log_file "${LOG_FILE}" \
221
+ --max_steps ${MAX_STEPS} \
222
+ --batch_size ${BATCH_SIZE} \
223
+ --grad_accum ${GRAD_ACCUM} \
224
+ --warmup_steps ${WARMUP_STEPS} \
225
+ --seed ${SEED} \
226
+ ${RESUME_ARG} \
227
+ ${EXTRA_ARGS} \
228
+ 2>&1 | { grep -v "UserWarning" \
229
+ | grep -v "Warning only once" \
230
+ | grep -v "Overriding a previously" \
231
+ | grep -v "dispatch key:" \
232
+ | grep -v "previous kernel:" \
233
+ | grep -v "new kernel:" \
234
+ | grep -v "operator: flash_attn" \
235
+ | grep -v "registered at /usr/local" \
236
+ | grep -v "self.m.impl" \
237
+ || true; }
238
+
239
+ EXIT_CODE=${PIPESTATUS[0]}
240
+
241
+ # ---- Exit summary ------------------------------------------------------------
242
+ echo ""
243
+ echo "=================================================================="
244
+ echo " Finished : $(date)"
245
+ echo " Exit code : ${EXIT_CODE}"
246
+ if [[ ${EXIT_CODE} -eq 0 ]]; then
247
+ echo " Status : SUCCESS (학습 완료 또는 graceful shutdown)"
248
+ elif [[ ${EXIT_CODE} -eq 143 ]]; then
249
+ echo " Status : TERMINATED (SIGTERM — 비상 체크포인트 저장됨)"
250
+ elif [[ ${EXIT_CODE} -eq 137 ]]; then
251
+ echo " Status : KILLED (SIGKILL — 강제 종료, 체크포인트 미저장)"
252
+ elif [[ ${EXIT_CODE} -eq 1 ]]; then
253
+ echo " Status : ERROR (${LOG_FILE} 확인 필요)"
254
+ else
255
+ echo " Status : FAILED (exit code ${EXIT_CODE}, ${LOG_FILE} 확인)"
256
+ fi
257
+ echo "=================================================================="
258
+ exit ${EXIT_CODE}
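Putting the graceful-shutdown and auto-resume pieces together, a typical stop-and-restart cycle might look like the following; checkpoint names are whatever the trainer last saved, and the relaunch picks the newest checkpoint-* automatically.

```bash
#!/usr/bin/env bash
# Sketch: graceful stop + restart cycle for the 3B pretrain run.
set -euo pipefail

CKPT_DIR=checkpoints/korean_3b_fp8_run1

# 1) Graceful stop — SIGTERM lets the Python handler save an emergency checkpoint.
kill "$(cat "${CKPT_DIR}/train.pid")"

# 2) Wait until the trainer processes are really gone before relaunching.
while pgrep -f "pretrain.py.*korean_3b" >/dev/null; do sleep 10; done

# 3) Relaunch — the launcher auto-detects the newest checkpoint-* and resumes.
bash scripts/launch_3b_pretrain.sh

# 4) Follow progress.
tail -f "${CKPT_DIR}/train.log"
```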
source/scripts/launch_3b_sft.sh ADDED
@@ -0,0 +1,145 @@
1
+ #!/usr/bin/env bash
2
+ # =============================================================================
3
+ # launch_3b_sft.sh — 8-GPU FP8 SFT launcher for 3B Korean LLM
4
+ #
5
+ # Usage:
6
+ # bash scripts/launch_3b_sft.sh
7
+ # bash scripts/launch_3b_sft.sh --max_steps 200 # quick test
8
+ # bash scripts/launch_3b_sft.sh --resume checkpoints/korean_3b_sft_v1/checkpoint-0002000
9
+ #
10
+ # Base model : checkpoints/korean_3b_fp8_run1/checkpoint-XXXXXX (기본값)
11
+ # --base_checkpoint 인자로 덮어쓸 수 있음
12
+ # SFT data : data/sft_combined/train_filtered.jsonl
13
+ # (먼저 scripts/prepare_sft_combined.sh → data/filter_sft_v2.py 실행)
14
+ #
15
+ # Effective batch: 2 (local) × 8 GPU × 4 (grad_accum) = 64 samples/step
16
+ # =============================================================================
17
+ set -euo pipefail
18
+
19
+ # ---- Configurable defaults --------------------------------------------------
20
+ RUN_NAME="${RUN_NAME:-korean_3b_sft_v1}"
21
+ CONFIG="${CONFIG:-configs/korean_3b_sft.yaml}"
22
+ BASE_CHECKPOINT="${BASE_CHECKPOINT:-checkpoints/korean_3b_fp8_run1/checkpoint-0057000}"
23
+ SFT_DATA="${SFT_DATA:-data/sft_combined/train_filtered.jsonl}"
24
+ VAL_DATA="${VAL_DATA:-data/sft_combined/val_filtered.jsonl}"
25
+ CKPT_DIR="checkpoints/${RUN_NAME}"
26
+ LOG_FILE="${CKPT_DIR}/train.log"
27
+ NPROC=8
28
+ MASTER_PORT="${MASTER_PORT:-29503}"
29
+
30
+ MAX_STEPS=33000
31
+ BATCH_SIZE=2
32
+ GRAD_ACCUM=4
33
+ LR="1.0e-5"
34
+ WARMUP_STEPS=500
35
+ SEED=42
36
+
37
+ EXTRA_ARGS="$@"
38
+
39
+ # ---- B200 / NVSwitch NCCL tuning (same as pretrain) -------------------------
40
+ export NCCL_IB_DISABLE=1
41
+ export NCCL_ALGO=Ring
42
+ export NCCL_PROTO=Simple
43
+ export NCCL_MIN_NCHANNELS=16
44
+ export NCCL_MAX_NCHANNELS=16
45
+ export NCCL_BUFFSIZE=67108864
46
+ export OMP_NUM_THREADS=4
47
+ export MKL_NUM_THREADS=4
48
+
49
+ # 3B 모델 VRAM 절약 — 동적 메모리 세그먼트 확장 허용
50
+ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
51
+
52
+ cd "$(dirname "$0")/.."
53
+
54
+ # ---- Pre-flight checks ------------------------------------------------------
55
+ if [[ ! -d "${BASE_CHECKPOINT}" ]]; then
56
+ echo "=================================================================="
57
+ echo " ERROR: Base checkpoint 디렉토리를 찾을 수 없습니다."
58
+ echo " 경로: ${BASE_CHECKPOINT}"
59
+ echo ""
60
+ echo " --base_checkpoint 인자로 실제 경로를 지정하거나"
61
+ echo " BASE_CHECKPOINT 환경변수를 설정하세요."
62
+ echo " 예: bash scripts/launch_3b_sft.sh --base_checkpoint checkpoints/korean_3b_fp8_run1/checkpoint-0057000"
63
+ echo "=================================================================="
64
+ exit 1
65
+ fi
66
+
67
+ if [[ ! -f "${SFT_DATA}" ]]; then
68
+ echo "=================================================================="
69
+ echo " ERROR: SFT 학습 데이터를 찾을 수 없습니다: ${SFT_DATA}"
70
+ echo ""
71
+ echo " 데이터 준비 순서:"
72
+ echo " 1. bash scripts/prepare_sft_combined.sh"
73
+ echo " 2. python data/filter_sft_v2.py \\"
74
+ echo " --input data/sft_combined/train.jsonl \\"
75
+ echo " --output data/sft_combined/train_filtered.jsonl"
76
+ echo "=================================================================="
77
+ exit 1
78
+ fi
79
+
80
+ # val 파일 없으면 원본 val.jsonl 로 폴백
81
+ if [[ ! -f "${VAL_DATA}" ]]; then
82
+ VAL_FALLBACK="data/sft_combined/val.jsonl"
83
+ if [[ -f "${VAL_FALLBACK}" ]]; then
84
+ VAL_DATA="${VAL_FALLBACK}"
85
+ echo "[INFO] val_filtered 없음, 폴백: ${VAL_DATA}"
86
+ else
87
+ echo "ERROR: VAL_DATA 파일을 찾을 수 없습니다: ${VAL_DATA}"
88
+ exit 1
89
+ fi
90
+ fi
91
+
92
+ mkdir -p "${CKPT_DIR}"
93
+
94
+ echo "=================================================================="
95
+ echo " 3B SFT Fine-Tuning"
96
+ echo " Run name : ${RUN_NAME}"
97
+ echo " Config : ${CONFIG}"
98
+ echo " Base checkpoint : ${BASE_CHECKPOINT}"
99
+ echo " SFT data : ${SFT_DATA}"
100
+ echo " Val data : ${VAL_DATA}"
101
+ echo " CKPT dir : ${CKPT_DIR}"
102
+ echo " Log file : ${LOG_FILE}"
103
+ echo " Max steps : ${MAX_STEPS}"
104
+ echo " Batch size : ${BATCH_SIZE} (local) × ${NPROC} GPU × ${GRAD_ACCUM} grad_accum = $((BATCH_SIZE * NPROC * GRAD_ACCUM)) eff_batch"
105
+ echo " Learning rate : ${LR}"
106
+ echo " Warmup : ${WARMUP_STEPS} steps"
107
+ echo " Master port : ${MASTER_PORT}"
108
+ echo " ALLOC_CONF : ${PYTORCH_CUDA_ALLOC_CONF}"
109
+ echo " Started : $(date)"
110
+ echo "=================================================================="
111
+
112
+ export PYTHONWARNINGS="ignore::UserWarning:torch.library"
113
+
114
+ torchrun \
115
+ --nproc_per_node=${NPROC} \
116
+ --master_port=${MASTER_PORT} \
117
+ train/sft.py \
118
+ --config "${CONFIG}" \
119
+ --base_checkpoint "${BASE_CHECKPOINT}" \
120
+ --sft_data "${SFT_DATA}" \
121
+ --val_data "${VAL_DATA}" \
122
+ --checkpoint_dir "${CKPT_DIR}" \
123
+ --log_file "${LOG_FILE}" \
124
+ --max_steps ${MAX_STEPS} \
125
+ --batch_size ${BATCH_SIZE} \
126
+ --grad_accum ${GRAD_ACCUM} \
127
+ --lr ${LR} \
128
+ --warmup_steps ${WARMUP_STEPS} \
129
+ --seed ${SEED} \
130
+ --use_fp8 \
131
+ ${EXTRA_ARGS} \
132
+ 2>&1 | grep -v "UserWarning" \
133
+ | grep -v "Warning only once" \
134
+ | grep -v "Overriding a previously" \
135
+ | grep -v "dispatch key:" \
136
+ | grep -v "previous kernel:" \
137
+ | grep -v "new kernel:" \
138
+ | grep -v "operator: flash_attn" \
139
+ | grep -v "registered at /usr/local" \
140
+ | grep -v "self.m.impl" \
141
+ | tee -a "${LOG_FILE}"
142
+
143
+ echo "=================================================================="
144
+ echo " 3B SFT Done : $(date)"
145
+ echo "=================================================================="
source/scripts/launch_3b_sft_v2.sh ADDED
@@ -0,0 +1,156 @@
1
+ #!/usr/bin/env bash
2
+ # =============================================================================
3
+ # launch_3b_sft_v2.sh — 8-GPU FP8 SFT v2 launcher for 3B Korean LLM
4
+ #
5
+ # SFT v2 improvements over v1:
6
+ # - LR: 1e-5 → 5e-5 (5x, resolve underfitting)
7
+ # - Effective batch: 64 → 256 (4x)
8
+ # - Data mixing: 70% SFT + 30% pretrain (forgetting prevention)
9
+ # - Weight decay: 0.01 → 0.05
10
+ # - Warmup: 500 → 2000 steps
11
+ # - Max steps: 33000 → 15000
12
+ #
13
+ # Usage:
14
+ # bash scripts/launch_3b_sft_v2.sh
15
+ # bash scripts/launch_3b_sft_v2.sh --max_steps 200 # quick test
16
+ # bash scripts/launch_3b_sft_v2.sh --resume checkpoints/korean_3b_sft_v2/checkpoint-0002000
17
+ #
18
+ # Effective batch: 4 (local) x 8 GPU x 8 (grad_accum) = 256 samples/step
19
+ # =============================================================================
20
+ set -euo pipefail
21
+
22
+ # ---- Configurable defaults --------------------------------------------------
23
+ RUN_NAME="${RUN_NAME:-korean_3b_sft_v2}"
24
+ CONFIG="${CONFIG:-configs/korean_3b_sft_v2.yaml}"
25
+ BASE_CHECKPOINT="${BASE_CHECKPOINT:-checkpoints/korean_3b_fp8_run1/checkpoint-0057000}"
26
+ SFT_DATA="${SFT_DATA:-data/sft_combined/train_filtered.jsonl}"
27
+ VAL_DATA="${VAL_DATA:-data/sft_combined/val_filtered.jsonl}"
28
+ PRETRAIN_DATA="${PRETRAIN_DATA:-data/3b_train.bin}"
29
+ CKPT_DIR="checkpoints/${RUN_NAME}"
30
+ LOG_FILE="${CKPT_DIR}/train.log"
31
+ NPROC=8
32
+ MASTER_PORT="${MASTER_PORT:-29504}"
33
+
34
+ MAX_STEPS=15000
35
+ BATCH_SIZE=4
36
+ GRAD_ACCUM=8
37
+ LR="5.0e-5"
38
+ WARMUP_STEPS=2000
39
+ WEIGHT_DECAY=0.05
40
+ PRETRAIN_MIX_RATIO=0.3
41
+ SEED=42
42
+
43
+ EXTRA_ARGS="$@"
44
+
45
+ # ---- B200 / NVSwitch NCCL tuning (same as pretrain) -------------------------
46
+ export NCCL_IB_DISABLE=1
47
+ export NCCL_ALGO=Ring
48
+ export NCCL_PROTO=Simple
49
+ export NCCL_MIN_NCHANNELS=16
50
+ export NCCL_MAX_NCHANNELS=16
51
+ export NCCL_BUFFSIZE=67108864
52
+ export OMP_NUM_THREADS=4
53
+ export MKL_NUM_THREADS=4
54
+
55
+ # 3B + bs=4 VRAM allocation
56
+ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
57
+
58
+ cd "$(dirname "$0")/.."
59
+
60
+ # ---- Pre-flight checks ------------------------------------------------------
61
+ if [[ ! -d "${BASE_CHECKPOINT}" ]]; then
62
+ echo "=================================================================="
63
+ echo " ERROR: Base checkpoint not found: ${BASE_CHECKPOINT}"
64
+ echo " Set BASE_CHECKPOINT env var or use --base_checkpoint CLI arg."
65
+ echo "=================================================================="
66
+ exit 1
67
+ fi
68
+
69
+ if [[ ! -f "${SFT_DATA}" ]]; then
70
+ echo "=================================================================="
71
+ echo " ERROR: SFT data not found: ${SFT_DATA}"
72
+ echo " Run: bash scripts/prepare_sft_combined.sh"
73
+ echo "=================================================================="
74
+ exit 1
75
+ fi
76
+
77
+ if [[ ! -f "${PRETRAIN_DATA}" ]]; then
78
+ echo "=================================================================="
79
+ echo " ERROR: Pretrain data not found: ${PRETRAIN_DATA}"
80
+ echo " Set PRETRAIN_DATA env var to the correct path."
81
+ echo "=================================================================="
82
+ exit 1
83
+ fi
84
+
85
+ # val fallback
86
+ if [[ ! -f "${VAL_DATA}" ]]; then
87
+ VAL_FALLBACK="data/sft_combined/val.jsonl"
88
+ if [[ -f "${VAL_FALLBACK}" ]]; then
89
+ VAL_DATA="${VAL_FALLBACK}"
90
+ echo "[INFO] val_filtered not found, fallback: ${VAL_DATA}"
91
+ else
92
+ echo "ERROR: VAL_DATA not found: ${VAL_DATA}"
93
+ exit 1
94
+ fi
95
+ fi
96
+
97
+ mkdir -p "${CKPT_DIR}"
98
+
99
+ echo "=================================================================="
100
+ echo " 3B SFT v2 Fine-Tuning"
101
+ echo " Run name : ${RUN_NAME}"
102
+ echo " Config : ${CONFIG}"
103
+ echo " Base checkpoint : ${BASE_CHECKPOINT}"
104
+ echo " SFT data : ${SFT_DATA}"
105
+ echo " Pretrain data : ${PRETRAIN_DATA}"
106
+ echo " Val data : ${VAL_DATA}"
107
+ echo " CKPT dir : ${CKPT_DIR}"
108
+ echo " Log file : ${LOG_FILE}"
109
+ echo " Max steps : ${MAX_STEPS}"
110
+ echo " Batch size : ${BATCH_SIZE} (local) x ${NPROC} GPU x ${GRAD_ACCUM} grad_accum = $((BATCH_SIZE * NPROC * GRAD_ACCUM)) eff_batch"
111
+ echo " Learning rate : ${LR}"
112
+ echo " Weight decay : ${WEIGHT_DECAY}"
113
+ echo " Warmup : ${WARMUP_STEPS} steps"
114
+ echo " Data mixing : $((100 - ${PRETRAIN_MIX_RATIO%.*}0))% SFT + ${PRETRAIN_MIX_RATIO}00% pretrain"
115
+ echo " Master port : ${MASTER_PORT}"
116
+ echo " ALLOC_CONF : ${PYTORCH_CUDA_ALLOC_CONF}"
117
+ echo " Started : $(date)"
118
+ echo "=================================================================="
119
+
120
+ export PYTHONWARNINGS="ignore::UserWarning:torch.library"
121
+
122
+ torchrun \
123
+ --nproc_per_node=${NPROC} \
124
+ --master_port=${MASTER_PORT} \
125
+ train/sft.py \
126
+ --config "${CONFIG}" \
127
+ --base_checkpoint "${BASE_CHECKPOINT}" \
128
+ --sft_data "${SFT_DATA}" \
129
+ --val_data "${VAL_DATA}" \
130
+ --pretrain_data "${PRETRAIN_DATA}" \
131
+ --pretrain_mix_ratio ${PRETRAIN_MIX_RATIO} \
132
+ --checkpoint_dir "${CKPT_DIR}" \
133
+ --log_file "${LOG_FILE}" \
134
+ --max_steps ${MAX_STEPS} \
135
+ --batch_size ${BATCH_SIZE} \
136
+ --grad_accum ${GRAD_ACCUM} \
137
+ --lr ${LR} \
138
+ --weight_decay ${WEIGHT_DECAY} \
139
+ --warmup_steps ${WARMUP_STEPS} \
140
+ --seed ${SEED} \
141
+ --use_fp8 \
142
+ ${EXTRA_ARGS} \
143
+ 2>&1 | grep -v "UserWarning" \
144
+ | grep -v "Warning only once" \
145
+ | grep -v "Overriding a previously" \
146
+ | grep -v "dispatch key:" \
147
+ | grep -v "previous kernel:" \
148
+ | grep -v "new kernel:" \
149
+ | grep -v "operator: flash_attn" \
150
+ | grep -v "registered at /usr/local" \
151
+ | grep -v "self.m.impl" \
152
+ | tee -a "${LOG_FILE}"
153
+
154
+ echo "=================================================================="
155
+ echo " 3B SFT v2 Done : $(date)"
156
+ echo "=================================================================="
source/scripts/launch_fp8.sh ADDED
@@ -0,0 +1,94 @@
1
+ #!/usr/bin/env bash
2
+ # =============================================================================
3
+ # launch_fp8.sh — 8-GPU FP8 pretraining launcher for B200
4
+ #
5
+ # Usage:
6
+ # bash scripts/launch_fp8.sh # full run
7
+ # bash scripts/launch_fp8.sh --max_steps 500 # quick test
8
+ # bash scripts/launch_fp8.sh --resume checkpoints/small_fp8_run1/checkpoint-0001000
9
+ #
10
+ # Config is read from configs/small_fp8.yaml (model) + CLI args (train).
11
+ # Logs: checkpoints/<RUN_NAME>/train.log
12
+ # checkpoints/<RUN_NAME>/tensorboard/
13
+ # =============================================================================
14
+ set -euo pipefail
15
+
16
+ # ---- Configurable defaults --------------------------------------------------
17
+ RUN_NAME="${RUN_NAME:-small_fp8_run1}"
18
+ CONFIG="${CONFIG:-configs/small_fp8.yaml}"
19
+ TRAIN_DATA="${TRAIN_DATA:-data/train.bin}"
20
+ VAL_DATA="${VAL_DATA:-data/val.bin}"
21
+ CKPT_DIR="checkpoints/${RUN_NAME}"
22
+ LOG_FILE="${CKPT_DIR}/train.log"
23
+ NPROC=8
24
+ MASTER_PORT="${MASTER_PORT:-29500}"
25
+
26
+ # ---- Defaults that can be overridden via extra CLI args --------------------
27
+ MAX_STEPS=100000
28
+ BATCH_SIZE=8
29
+ GRAD_ACCUM=4
30
+ WARMUP_STEPS=2000
31
+ SEED=42
32
+
33
+ # ---- Pass remaining CLI args directly to pretrain.py ----------------------
34
+ EXTRA_ARGS="$@"
35
+
36
+ # ---- B200 / NVSwitch single-node NCCL tuning --------------------------------
37
+ # Single-node NVSwitch (NV18 full-mesh): disable IB to prevent NCCL probing.
38
+ export NCCL_IB_DISABLE=1
39
+ # Use Ring algorithm for large gradient tensors (128M-70B model range).
40
+ export NCCL_ALGO=Ring
41
+ # Simple protocol is optimal for NVLink bulk transfers (vs LL/LL128 for IB).
42
+ export NCCL_PROTO=Simple
43
+ # More channels → better NVSwitch saturation for large all-reduce payloads.
44
+ export NCCL_MIN_NCHANNELS=16
45
+ export NCCL_MAX_NCHANNELS=16
46
+ # Larger NCCL buffer (64 MB) reduces ring synchronisation overhead.
47
+ export NCCL_BUFFSIZE=67108864
48
+ # CPU thread limits (72 cores ÷ 8 ranks = 9; use 4 for DataLoader headroom).
49
+ export OMP_NUM_THREADS=4
50
+ export MKL_NUM_THREADS=4
51
+
52
+ # ---- Setup ------------------------------------------------------------------
53
+ mkdir -p "${CKPT_DIR}"
54
+ cd "$(dirname "$0")/.." # always run from project root
55
+
56
+ echo "=================================================================="
57
+ echo " Run name : ${RUN_NAME}"
58
+ echo " Config : ${CONFIG}"
59
+ echo " CKPT dir : ${CKPT_DIR}"
60
+ echo " Log file : ${LOG_FILE}"
61
+ echo " Started : $(date)"
62
+ echo "=================================================================="
63
+
64
+ # Suppress the harmless flash_attn kernel override warning from all ranks.
65
+ export PYTHONWARNINGS="ignore::UserWarning:torch.library"
66
+
67
+ torchrun \
68
+ --nproc_per_node=${NPROC} \
69
+ --master_port=${MASTER_PORT} \
70
+ train/pretrain.py \
71
+ --config "${CONFIG}" \
72
+ --train_data "${TRAIN_DATA}" \
73
+ --val_data "${VAL_DATA}" \
74
+ --checkpoint_dir "${CKPT_DIR}" \
75
+ --log_file "${LOG_FILE}" \
76
+ --max_steps ${MAX_STEPS} \
77
+ --batch_size ${BATCH_SIZE} \
78
+ --grad_accum ${GRAD_ACCUM} \
79
+ --warmup_steps ${WARMUP_STEPS} \
80
+ --seed ${SEED} \
81
+ ${EXTRA_ARGS} \
82
+ 2>&1 | grep -v "UserWarning" \
83
+ | grep -v "Warning only once" \
84
+ | grep -v "Overriding a previously" \
85
+ | grep -v "dispatch key:" \
86
+ | grep -v "previous kernel:" \
87
+ | grep -v "new kernel:" \
88
+ | grep -v "operator: flash_attn" \
89
+ | grep -v "registered at /usr/local" \
90
+ | grep -v "self.m.impl"
91
+
92
+ echo "=================================================================="
93
+ echo " Done : $(date)"
94
+ echo "=================================================================="
source/scripts/launch_hybrid_3b.sh ADDED
@@ -0,0 +1,62 @@
1
+ #!/bin/bash
2
+ # ============================================================================
3
+ # FRANKENSTALLM-H 3B: Hybrid Mamba-2 + Transformer 학습 런치 스크립트
4
+ # ============================================================================
5
+ #
6
+ # 사용법:
7
+ # nohup setsid bash scripts/launch_hybrid_3b.sh > logs/hybrid_3b.log 2>&1 &
8
+ #
9
+ # SIGHUP 방어: nohup + setsid 조합으로 SSH 끊김에도 학습 유지
10
+ # ============================================================================
11
+
12
+ set -euo pipefail
13
+
14
+ # ---- 환경 변수 ----
15
+ export OMP_NUM_THREADS=4
16
+ export NCCL_ALGO=NVLS # NVSwitch 최적 알고리즘
17
+ export NCCL_IB_DISABLE=1 # InfiniBand 비활성 (단일 노드)
18
+ export NCCL_P2P_LEVEL=NVL # NVLink P2P
19
+ export NCCL_NET_GDR_LEVEL=0 # GPU Direct RDMA 비활성 (단일 노드)
20
+
21
+ # ---- 경로 ----
22
+ PROJECT_ROOT="/PROJECT/0325120031_A/ghong/taketimes/llm-bang"
23
+ CONFIG="${PROJECT_ROOT}/configs/hybrid_3b.yaml"
24
+ TRAIN_DATA="${PROJECT_ROOT}/data/3b_train.bin"
25
+ VAL_DATA="${PROJECT_ROOT}/data/3b_val.bin"
26
+ CKPT_DIR="${PROJECT_ROOT}/checkpoints/hybrid_3b_run1"
27
+ LOG_FILE="${PROJECT_ROOT}/logs/hybrid_3b_train.log"
28
+
29
+ # ---- 디렉토리 생성 ----
30
+ mkdir -p "${CKPT_DIR}"
31
+ mkdir -p "$(dirname ${LOG_FILE})"
32
+
33
+ cd "${PROJECT_ROOT}"
34
+
35
+ echo "============================================"
36
+ echo " FRANKENSTALLM-H 3B Hybrid Training"
37
+ echo " Config: ${CONFIG}"
38
+ echo " Data: ${TRAIN_DATA}"
39
+ echo " Checkpoint: ${CKPT_DIR}"
40
+ echo " Started: $(date '+%Y-%m-%d %H:%M:%S')"
41
+ echo "============================================"
42
+
43
+ # ---- 학습 실행 (8 GPU DDP) ----
44
+ torchrun \
45
+ --nproc_per_node=8 \
46
+ --master_port=29500 \
47
+ train/pretrain.py \
48
+ --config "${CONFIG}" \
49
+ --train_data "${TRAIN_DATA}" \
50
+ --val_data "${VAL_DATA}" \
51
+ --checkpoint_dir "${CKPT_DIR}" \
52
+ --batch_size 4 \
53
+ --lr 2e-4 \
54
+ --weight_decay 0.1 \
55
+ --warmup_steps 2000 \
56
+ --grad_accum 8 \
57
+ --max_steps 57000 \
58
+ --log_file "${LOG_FILE}" \
59
+ --use_fp8 \
60
+ "$@"
61
+
62
+ echo "Training finished at $(date '+%Y-%m-%d %H:%M:%S')"
source/scripts/launch_korean_1b.sh ADDED
@@ -0,0 +1,133 @@
1
+ #!/usr/bin/env bash
2
+ # =============================================================================
3
+ # launch_korean_1b.sh — 8-GPU FP8 pretraining launcher for 1B Korean LLM
4
+ #
5
+ # Usage:
6
+ # bash scripts/launch_korean_1b.sh # full run
7
+ # bash scripts/launch_korean_1b.sh --max_steps 500 # quick test
8
+ # bash scripts/launch_korean_1b.sh --resume checkpoints/korean_1b_fp8_run1/checkpoint-0010000
9
+ #
10
+ # Config is read from configs/korean_1b_fp8.yaml (model) + CLI args (train).
11
+ # Effective batch size: 8 (local) × 8 GPU × 4 (grad_accum) × 4096 (seq_len)
12
+ # = 1,048,576 tokens / step
13
+ # Logs: checkpoints/<RUN_NAME>/train.log
14
+ # checkpoints/<RUN_NAME>/tensorboard/
15
+ # =============================================================================
16
+ set -euo pipefail
17
+
18
+ # ---- Configurable defaults --------------------------------------------------
19
+ RUN_NAME="${RUN_NAME:-korean_1b_fp8_run1}"
20
+ CONFIG="${CONFIG:-configs/korean_1b_fp8.yaml}"
21
+ TRAIN_DATA="${TRAIN_DATA:-data/korean_train.bin}"
22
+ VAL_DATA="${VAL_DATA:-data/korean_val.bin}"
23
+ CKPT_DIR="checkpoints/${RUN_NAME}"
24
+ LOG_FILE="${CKPT_DIR}/train.log"
25
+ NPROC=8
26
+ MASTER_PORT="${MASTER_PORT:-29501}"
27
+
28
+ # ---- Defaults that can be overridden via extra CLI args --------------------
29
+ MAX_STEPS=34000 # 4 epochs × 8.91B tokens = 35.6B (Muennighoff 2023: val loss rises beyond 4 epochs)
30
+ BATCH_SIZE=8
31
+ GRAD_ACCUM=4
32
+ WARMUP_STEPS=2000 # 5.9% of 34k steps (the previous 4000 = 11.8% was excessive)
33
+ SEED=42
34
+
35
+ # ---- Pass remaining CLI args directly to pretrain.py ----------------------
36
+ EXTRA_ARGS="$@"
37
+
38
+ # ---- B200 / NVSwitch single-node NCCL tuning --------------------------------
39
+ # Single-node NVSwitch (NV18 full-mesh): disable IB to prevent NCCL probing.
40
+ export NCCL_IB_DISABLE=1
41
+ # Use Ring algorithm for large gradient tensors (128M-70B model range).
42
+ export NCCL_ALGO=Ring
43
+ # Simple protocol is optimal for NVLink bulk transfers (vs LL/LL128 for IB).
44
+ export NCCL_PROTO=Simple
45
+ # More channels → better NVSwitch saturation for large all-reduce payloads.
46
+ export NCCL_MIN_NCHANNELS=16
47
+ export NCCL_MAX_NCHANNELS=16
48
+ # Larger NCCL buffer (64 MB) reduces ring synchronisation overhead.
49
+ export NCCL_BUFFSIZE=67108864
50
+ # CPU thread limits (72 cores ÷ 8 ranks = 9; use 4 for DataLoader headroom).
51
+ export OMP_NUM_THREADS=4
52
+ export MKL_NUM_THREADS=4
53
+
54
+ # ---- Setup ------------------------------------------------------------------
55
+ cd "$(dirname "$0")/.." # always run from project root
56
+
57
+ # ---- Pre-flight check: Korean data must exist before launching --------------
58
+ if [[ ! -f "${TRAIN_DATA}" ]]; then
59
+ echo "=================================================================="
60
+ echo " ERROR: Training data not found: ${TRAIN_DATA}"
61
+ echo ""
62
+ echo " You need to run the Korean data pipeline first."
63
+ echo " Example steps:"
64
+ echo " 1. Download / prepare raw Korean corpus"
65
+ echo " 2. Tokenise and pack into binary format:"
66
+ echo " python data/prepare_korean.py --output data/korean_train.bin"
67
+ echo " 3. Re-run this script once the file exists."
68
+ echo "=================================================================="
69
+ exit 1
70
+ fi
71
+
72
+ if [[ ! -f "${VAL_DATA}" ]]; then
73
+ echo "=================================================================="
74
+ echo " ERROR: Validation data not found: ${VAL_DATA}"
75
+ echo ""
76
+ echo " You need to run the Korean data pipeline first."
77
+ echo " Example steps:"
78
+ echo " 1. Download / prepare raw Korean corpus"
79
+ echo " 2. Tokenise and pack into binary format (val split):"
80
+ echo " python data/prepare_korean.py --output_val data/korean_val.bin"
81
+ echo " 3. Re-run this script once the file exists."
82
+ echo "=================================================================="
83
+ exit 1
84
+ fi
85
+
86
+ mkdir -p "${CKPT_DIR}"
87
+
88
+ echo "=================================================================="
89
+ echo " Run name : ${RUN_NAME}"
90
+ echo " Config : ${CONFIG}"
91
+ echo " Train data : ${TRAIN_DATA}"
92
+ echo " Val data : ${VAL_DATA}"
93
+ echo " CKPT dir : ${CKPT_DIR}"
94
+ echo " Log file : ${LOG_FILE}"
95
+ echo " Max steps : ${MAX_STEPS}"
96
+ echo " Batch size : ${BATCH_SIZE} (local) × ${NPROC} GPU × ${GRAD_ACCUM} grad_accum"
97
+ echo " Warmup : ${WARMUP_STEPS} steps"
98
+ echo " Master port : ${MASTER_PORT}"
99
+ echo " Started : $(date)"
100
+ echo "=================================================================="
101
+
102
+ # Suppress the harmless flash_attn kernel override warning from all ranks.
103
+ export PYTHONWARNINGS="ignore::UserWarning:torch.library"
104
+
105
+ torchrun \
106
+ --nproc_per_node=${NPROC} \
107
+ --master_port=${MASTER_PORT} \
108
+ train/pretrain.py \
109
+ --config "${CONFIG}" \
110
+ --train_data "${TRAIN_DATA}" \
111
+ --val_data "${VAL_DATA}" \
112
+ --checkpoint_dir "${CKPT_DIR}" \
113
+ --log_file "${LOG_FILE}" \
114
+ --max_steps ${MAX_STEPS} \
115
+ --batch_size ${BATCH_SIZE} \
116
+ --grad_accum ${GRAD_ACCUM} \
117
+ --warmup_steps ${WARMUP_STEPS} \
118
+ --seed ${SEED} \
119
+ ${EXTRA_ARGS} \
120
+ 2>&1 | grep -v "UserWarning" \
121
+ | grep -v "Warning only once" \
122
+ | grep -v "Overriding a previously" \
123
+ | grep -v "dispatch key:" \
124
+ | grep -v "previous kernel:" \
125
+ | grep -v "new kernel:" \
126
+ | grep -v "operator: flash_attn" \
127
+ | grep -v "registered at /usr/local" \
128
+ | grep -v "self.m.impl" \
129
+ | tee -a "${LOG_FILE}"
130
+
131
+ echo "=================================================================="
132
+ echo " Done : $(date)"
133
+ echo "=================================================================="
source/scripts/launch_korean_3b.sh ADDED
@@ -0,0 +1,115 @@
 
 
1
+ #!/usr/bin/env bash
2
+ # =============================================================================
3
+ # launch_korean_3b.sh — 8-GPU FP8 pretraining launcher for 3B Korean LLM
4
+ #
5
+ # Usage:
6
+ # bash scripts/launch_korean_3b.sh # full run (~60B tokens)
7
+ # bash scripts/launch_korean_3b.sh --max_steps 50 # quick benchmark
8
+ # bash scripts/launch_korean_3b.sh --resume checkpoints/korean_3b_fp8_run1/checkpoint-XXXXX
9
+ #
10
+ # Effective batch size: 8 (local) × 8 GPU × 4 (grad_accum) × 4096 (seq_len)
11
+ # = 1,048,576 tokens / step
12
+ # =============================================================================
13
+ set -euo pipefail
14
+
15
+ RUN_NAME="${RUN_NAME:-korean_3b_fp8_run1}"
16
+ CONFIG="${CONFIG:-configs/3b_pretrain.yaml}"
17
+ CKPT_DIR="checkpoints/${RUN_NAME}"
18
+ LOG_FILE="${CKPT_DIR}/train.log"
19
+ NPROC=8
20
+ MASTER_PORT="${MASTER_PORT:-29502}"
21
+
22
+ MAX_STEPS=57000
23
+ BATCH_SIZE=4
24
+ GRAD_ACCUM=8
25
+ LR=1.5e-4
26
+ WARMUP_STEPS=2000
27
+ SEED=42
28
+
29
+ EXTRA_ARGS="$@"
30
+
31
+ # ---- B200 / NVSwitch NCCL tuning -------------------------------------------
32
+ export NCCL_IB_DISABLE=1
33
+ export NCCL_ALGO=Ring
34
+ export NCCL_PROTO=Simple
35
+ export NCCL_MIN_NCHANNELS=16
36
+ export NCCL_MAX_NCHANNELS=16
37
+ export NCCL_BUFFSIZE=67108864
38
+ export OMP_NUM_THREADS=4
39
+ export MKL_NUM_THREADS=4
40
+
41
+ # cd FIRST so the relative-path checks below resolve against the project root
42
+ cd "$(dirname "$0")/.."
43
+
44
+ # TRAIN_DATA fallback: relative-path check runs after the cd above
45
+ if [[ -f "data/merged_3b_train.bin" ]]; then
46
+ TRAIN_DATA="${TRAIN_DATA:-data/merged_3b_train.bin}"
47
+ echo "Using merged training data: data/merged_3b_train.bin"
48
+ elif [[ -f "data/korean_train.bin" ]]; then
49
+ TRAIN_DATA="${TRAIN_DATA:-data/korean_train.bin}"
50
+ echo "Using fallback training data: data/korean_train.bin"
51
+ else
52
+ echo "ERROR: No training data found (data/merged_3b_train.bin or data/korean_train.bin)"
53
+ exit 1
54
+ fi
55
+
56
+ # VAL_DATA fallback: relative-path check runs after the cd above
57
+ VAL_DATA="${VAL_DATA:-data/merged_3b_val.bin}"
58
+ if [[ ! -f "${VAL_DATA}" ]]; then
59
+ VAL_DATA="data/korean_val.bin"
60
+ fi
61
+
62
+ if [[ ! -f "${TRAIN_DATA}" ]]; then
63
+ echo "ERROR: Training data not found: ${TRAIN_DATA}"
64
+ exit 1
65
+ fi
66
+ if [[ ! -f "${VAL_DATA}" ]]; then
67
+ echo "ERROR: Validation data not found: ${VAL_DATA}"
68
+ exit 1
69
+ fi
70
+
71
+ mkdir -p "${CKPT_DIR}"
72
+
73
+ echo "=================================================================="
74
+ echo " Run name : ${RUN_NAME}"
75
+ echo " Config : ${CONFIG}"
76
+ echo " Train data : ${TRAIN_DATA}"
77
+ echo " CKPT dir : ${CKPT_DIR}"
78
+ echo " Max steps : ${MAX_STEPS}"
79
+ echo " LR : ${LR}"
80
+ echo " Batch size : ${BATCH_SIZE} (local) × ${NPROC} GPU × ${GRAD_ACCUM} grad_accum"
81
+ echo " Started : $(date)"
82
+ echo "=================================================================="
83
+
84
+ export PYTHONWARNINGS="ignore::UserWarning:torch.library"
85
+
86
+ torchrun \
87
+ --nproc_per_node=${NPROC} \
88
+ --master_port=${MASTER_PORT} \
89
+ train/pretrain.py \
90
+ --config "${CONFIG}" \
91
+ --train_data "${TRAIN_DATA}" \
92
+ --val_data "${VAL_DATA}" \
93
+ --checkpoint_dir "${CKPT_DIR}" \
94
+ --log_file "${LOG_FILE}" \
95
+ --max_steps ${MAX_STEPS} \
96
+ --batch_size ${BATCH_SIZE} \
97
+ --lr ${LR} \
98
+ --grad_accum ${GRAD_ACCUM} \
99
+ --warmup_steps ${WARMUP_STEPS} \
100
+ --seed ${SEED} \
101
+ ${EXTRA_ARGS} \
102
+ 2>&1 | grep -v "UserWarning" \
103
+ | grep -v "Warning only once" \
104
+ | grep -v "Overriding a previously" \
105
+ | grep -v "dispatch key:" \
106
+ | grep -v "previous kernel:" \
107
+ | grep -v "new kernel:" \
108
+ | grep -v "operator: flash_attn" \
109
+ | grep -v "registered at /usr/local" \
110
+ | grep -v "self.m.impl" \
111
+ | tee -a "${LOG_FILE}"
112
+
113
+ echo "=================================================================="
114
+ echo " Done : $(date)"
115
+ echo "=================================================================="
source/scripts/launch_sft.sh ADDED
@@ -0,0 +1,111 @@
 
 
1
+ #!/usr/bin/env bash
2
+ # =============================================================================
3
+ # launch_sft.sh — 8-GPU FP8 SFT launcher for 1B Korean LLM
4
+ #
5
+ # Usage:
6
+ # bash scripts/launch_sft.sh
7
+ # bash scripts/launch_sft.sh --max_steps 500 # quick test
8
+ # bash scripts/launch_sft.sh --resume checkpoints/korean_1b_sft/checkpoint-0001000
9
+ #
10
+ # Base model: checkpoints/korean_1b_fp8_run1/checkpoint-0034000
11
+ # SFT data: data/sft/train.jsonl
12
+ # =============================================================================
13
+ set -euo pipefail
14
+
15
+ # ---- Configurable defaults --------------------------------------------------
16
+ RUN_NAME="${RUN_NAME:-korean_1b_sft}"
17
+ BASE_CHECKPOINT="${BASE_CHECKPOINT:-checkpoints/korean_1b_fp8_run1/checkpoint-0034000}"
18
+ SFT_DATA="${SFT_DATA:-data/sft/train.jsonl}"
19
+ VAL_DATA="${VAL_DATA:-data/sft/val.jsonl}"
20
+ CKPT_DIR="checkpoints/${RUN_NAME}"
21
+ LOG_FILE="${CKPT_DIR}/train.log"
22
+ NPROC=8
23
+ MASTER_PORT="${MASTER_PORT:-29502}"
24
+
25
+ MAX_STEPS=9000
26
+ BATCH_SIZE=4
27
+ GRAD_ACCUM=2
28
+ LR="2.0e-5"
29
+ WARMUP_STEPS=300
30
+ SEED=42
31
+
32
+ EXTRA_ARGS="$@"
33
+
34
+ # ---- B200 / NVSwitch NCCL tuning (same as pretrain) -------------------------
35
+ export NCCL_IB_DISABLE=1
36
+ export NCCL_ALGO=Ring
37
+ export NCCL_PROTO=Simple
38
+ export NCCL_MIN_NCHANNELS=16
39
+ export NCCL_MAX_NCHANNELS=16
40
+ export NCCL_BUFFSIZE=67108864
41
+ export OMP_NUM_THREADS=4
42
+ export MKL_NUM_THREADS=4
43
+
44
+ cd "$(dirname "$0")/.."
45
+
46
+ # ---- Pre-flight checks ------------------------------------------------------
47
+ if [[ ! -d "${BASE_CHECKPOINT}" ]]; then
48
+ echo "ERROR: Base checkpoint not found: ${BASE_CHECKPOINT}"
49
+ exit 1
50
+ fi
51
+
52
+ if [[ ! -f "${SFT_DATA}" ]]; then
53
+ echo "=================================================================="
54
+ echo " ERROR: SFT training data not found: ${SFT_DATA}"
55
+ echo ""
56
+ echo " Run the data preparation script first:"
57
+ echo " python data/prepare_sft_data.py"
58
+ echo "=================================================================="
59
+ exit 1
60
+ fi
61
+
62
+ mkdir -p "${CKPT_DIR}"
63
+
64
+ echo "=================================================================="
65
+ echo " SFT Fine-Tuning"
66
+ echo " Run name : ${RUN_NAME}"
67
+ echo " Base checkpoint : ${BASE_CHECKPOINT}"
68
+ echo " SFT data : ${SFT_DATA}"
69
+ echo " CKPT dir : ${CKPT_DIR}"
70
+ echo " Log file : ${LOG_FILE}"
71
+ echo " Max steps : ${MAX_STEPS}"
72
+ echo " Batch size : ${BATCH_SIZE} (local) × ${NPROC} GPU × ${GRAD_ACCUM} grad_accum"
73
+ echo " Learning rate : ${LR}"
74
+ echo " Warmup : ${WARMUP_STEPS} steps"
75
+ echo " Master port : ${MASTER_PORT}"
76
+ echo " Started : $(date)"
77
+ echo "=================================================================="
78
+
79
+ export PYTHONWARNINGS="ignore::UserWarning:torch.library"
80
+
81
+ torchrun \
82
+ --nproc_per_node=${NPROC} \
83
+ --master_port=${MASTER_PORT} \
84
+ train/sft.py \
85
+ --base_checkpoint "${BASE_CHECKPOINT}" \
86
+ --sft_data "${SFT_DATA}" \
87
+ --checkpoint_dir "${CKPT_DIR}" \
88
+ --log_file "${LOG_FILE}" \
89
+ --max_steps ${MAX_STEPS} \
90
+ --batch_size ${BATCH_SIZE} \
91
+ --grad_accum ${GRAD_ACCUM} \
92
+ --lr ${LR} \
93
+ --warmup_steps ${WARMUP_STEPS} \
94
+ --seed ${SEED} \
95
+ --use_fp8 \
96
+ --val_data "${VAL_DATA}" \
97
+ ${EXTRA_ARGS} \
98
+ 2>&1 | grep -v "UserWarning" \
99
+ | grep -v "Warning only once" \
100
+ | grep -v "Overriding a previously" \
101
+ | grep -v "dispatch key:" \
102
+ | grep -v "previous kernel:" \
103
+ | grep -v "new kernel:" \
104
+ | grep -v "operator: flash_attn" \
105
+ | grep -v "registered at /usr/local" \
106
+ | grep -v "self.m.impl" \
107
+ | tee -a "${LOG_FILE}"
108
+
109
+ echo "=================================================================="
110
+ echo " SFT Done : $(date)"
111
+ echo "=================================================================="
source/scripts/migrate_qkv_checkpoint.py ADDED
@@ -0,0 +1,230 @@
 
 
1
+ #!/usr/bin/env python3
2
+ """Migrate checkpoint from separate Q/K/V projections to fused QKV.
3
+
4
+ Usage:
5
+ python3 scripts/migrate_qkv_checkpoint.py <checkpoint_dir>
6
+
7
+ Migrates both model.pt AND optimizer.pt:
8
+ - model.pt: q_proj/k_proj/v_proj weights → qkv_proj weight
9
+ - optimizer.pt: exp_avg/exp_avg_sq states fused, param indices re-mapped
10
+
11
+ The concatenation order is [Q ; K ; V] along the output (dim-0) axis,
12
+ which matches the split in MultiHeadAttention.forward:
13
+ q, k, v = qkv.split([_q_dim, _kv_dim, _kv_dim], dim=-1)
14
+
15
+ Optimizer layout (group 0 = weight_decay, per layer × 28):
16
+ [i*6+0] q_proj.weight [3072, 3072]
17
+ [i*6+1] k_proj.weight [1024, 3072]
18
+ [i*6+2] v_proj.weight [1024, 3072]
19
+ [i*6+3] out_proj.weight [3072, 3072]
20
+ [i*6+4] fc1_weight [16384, 3072]
21
+ [i*6+5] fc2_weight [3072, 8192]
22
+ After fusion: indices 0,1,2 → single qkv_proj → 4 params per layer.
23
+ """
24
+ import sys
25
+ import torch
26
+ from pathlib import Path
27
+
28
+ N_LAYERS = 28
29
+ OLD_PARAMS_PER_LAYER = 6 # q, k, v, out, fc1, fc2
30
+ NEW_PARAMS_PER_LAYER = 4 # qkv, out, fc1, fc2
31
+
32
+
33
+ def migrate_model(state: dict) -> dict:
34
+ """Fuse Q/K/V projection weights into QKV in model state dict."""
35
+ new_state: dict = {}
36
+ layers_done: set = set()
37
+
38
+ for key, val in state.items():
39
+ if ".q_proj." not in key and ".k_proj." not in key and ".v_proj." not in key:
40
+ new_state[key] = val
41
+ continue
42
+
43
+ if ".q_proj." not in key:
44
+ continue
45
+
46
+ prefix = key.rsplit(".", 2)[0]
47
+ suffix = key.rsplit(".", 1)[-1]
48
+
49
+ tag = (prefix, suffix)
50
+ if tag in layers_done:
51
+ continue
52
+ layers_done.add(tag)
53
+
54
+ q_key = f"{prefix}.q_proj.{suffix}"
55
+ k_key = f"{prefix}.k_proj.{suffix}"
56
+ v_key = f"{prefix}.v_proj.{suffix}"
57
+
58
+ missing = [k for k in (q_key, k_key, v_key) if k not in state]
59
+ if missing:
60
+ raise KeyError(f"Expected keys not found in checkpoint: {missing}")
61
+
62
+ q_w, k_w, v_w = state[q_key], state[k_key], state[v_key]
63
+ fused = torch.cat([q_w, k_w, v_w], dim=0)
64
+ fused_key = f"{prefix}.qkv_proj.{suffix}"
65
+ new_state[fused_key] = fused
66
+ print(f" Fused {fused_key}: {list(fused.shape)}"
67
+ f" (q={list(q_w.shape)}, k={list(k_w.shape)}, v={list(v_w.shape)})")
68
+
69
+ leaked = [k for k in new_state if ".q_proj." in k or ".k_proj." in k or ".v_proj." in k]
70
+ if leaked:
71
+ raise RuntimeError(f"BUG: old projection keys still present: {leaked}")
72
+
73
+ return new_state
74
+
75
+
76
+ def migrate_optimizer(opt_state: dict) -> dict:
77
+ """Fuse optimizer states for Q/K/V → QKV and re-index parameters.
78
+
79
+ The optimizer has 2 param groups:
80
+ Group 0 (weight_decay): 168 = 28 layers × 6 (q,k,v,out,fc1,fc2)
81
+ Group 1 (no weight_decay): 58 = norms + embedding
82
+
83
+ We fuse q,k,v entries in group 0 (indices i*6+0,1,2 → one entry per layer).
84
+ Group 0 shrinks from 168 to 112 (28 layers × 4 params).
85
+ Group 1 stays at 58. Total: 170.
86
+ """
87
+ old_state = opt_state["state"]
88
+ old_groups = opt_state["param_groups"]
89
+
90
+ group0_count = len(old_groups[0]["params"])
91
+ expected_g0 = N_LAYERS * OLD_PARAMS_PER_LAYER
92
+ if group0_count != expected_g0:
93
+ raise ValueError(
94
+ f"Group 0 has {group0_count} params, expected {expected_g0}. "
95
+ f"Cannot auto-detect QKV layout."
96
+ )
97
+
98
+ # Validate shapes for first layer
99
+ shapes = []
100
+ for j in range(OLD_PARAMS_PER_LAYER):
101
+ idx = old_groups[0]["params"][j]
102
+ shapes.append(list(old_state[idx]["exp_avg"].shape))
103
+ expected_shapes = [[3072, 3072], [1024, 3072], [1024, 3072],
104
+ [3072, 3072], [16384, 3072], [3072, 8192]]
105
+ if shapes != expected_shapes:
106
+ raise ValueError(
107
+ f"Layer 0 shapes {shapes} don't match expected {expected_shapes}. "
108
+ f"Cannot auto-detect QKV layout."
109
+ )
110
+ print(f" Shape validation passed for layer 0.")
111
+
112
+ new_state_entries = {}
113
+ new_idx = 0
114
+
115
+ # --- Group 0: fuse q/k/v per layer ---
116
+ for layer_i in range(N_LAYERS):
117
+ base = layer_i * OLD_PARAMS_PER_LAYER
118
+ q_opt_idx = old_groups[0]["params"][base + 0]
119
+ k_opt_idx = old_groups[0]["params"][base + 1]
120
+ v_opt_idx = old_groups[0]["params"][base + 2]
121
+
122
+ q_entry = old_state[q_opt_idx]
123
+ k_entry = old_state[k_opt_idx]
124
+ v_entry = old_state[v_opt_idx]
125
+
126
+ # Fuse QKV
127
+ fused_entry = {"step": q_entry["step"]}
128
+ for field in ["exp_avg", "exp_avg_sq"]:
129
+ if field in q_entry:
130
+ fused_entry[field] = torch.cat(
131
+ [q_entry[field], k_entry[field], v_entry[field]], dim=0
132
+ )
133
+ new_state_entries[new_idx] = fused_entry
134
+ if layer_i == 0:
135
+ print(f" Layer 0 QKV fused: exp_avg {list(fused_entry['exp_avg'].shape)}")
136
+ new_idx += 1
137
+
138
+ # Copy remaining params (out, fc1, fc2)
139
+ for offset in [3, 4, 5]:
140
+ opt_idx = old_groups[0]["params"][base + offset]
141
+ new_state_entries[new_idx] = old_state[opt_idx]
142
+ new_idx += 1
143
+
144
+ new_group0_count = new_idx # should be N_LAYERS * NEW_PARAMS_PER_LAYER = 112
145
+ print(f" Group 0: {group0_count} → {new_group0_count} params")
146
+
147
+ # --- Group 1: copy as-is (norms, embedding — no QKV) ---
148
+ group1_count = len(old_groups[1]["params"])
149
+ for j in range(group1_count):
150
+ opt_idx = old_groups[1]["params"][j]
151
+ if opt_idx in old_state:
152
+ new_state_entries[new_idx] = old_state[opt_idx]
153
+ new_idx += 1
154
+ print(f" Group 1: {group1_count} → {group1_count} params (unchanged)")
155
+
156
+ # Build new param_groups
157
+ new_groups = []
158
+ g0 = {k: v for k, v in old_groups[0].items() if k != "params"}
159
+ g0["params"] = list(range(0, new_group0_count))
160
+ new_groups.append(g0)
161
+
162
+ g1 = {k: v for k, v in old_groups[1].items() if k != "params"}
163
+ g1["params"] = list(range(new_group0_count, new_group0_count + group1_count))
164
+ new_groups.append(g1)
165
+
166
+ total = new_group0_count + group1_count
167
+ print(f" Total: {len(old_state)} → {total} optimizer params")
168
+
169
+ return {"state": new_state_entries, "param_groups": new_groups}
170
+
171
+
172
+ def migrate(ckpt_dir: Path) -> None:
173
+ model_path = ckpt_dir / "model.pt"
174
+ opt_path = ckpt_dir / "optimizer.pt"
175
+
176
+ if not model_path.exists():
177
+ raise FileNotFoundError(f"model.pt not found in {ckpt_dir}")
178
+
179
+ # --- Model migration ---
180
+ print(f"[1/2] Migrating model weights from {model_path} ...")
181
+ state = torch.load(model_path, map_location="cpu", weights_only=True)
182
+
183
+ has_old = any(".q_proj." in k for k in state)
184
+ has_new = any(".qkv_proj." in k for k in state)
185
+
186
+ if has_new and not has_old:
187
+ print(" Model already migrated. Skipping.")
188
+ elif has_old:
189
+ new_model_state = migrate_model(state)
190
+ torch.save(new_model_state, model_path)
191
+ print(f" Model saved.")
192
+ else:
193
+ raise RuntimeError("Model state has neither q_proj nor qkv_proj keys!")
194
+
195
+ # --- Optimizer migration ---
196
+ if opt_path.exists():
197
+ print(f"\n[2/2] Migrating optimizer states from {opt_path} ...")
198
+ opt = torch.load(opt_path, map_location="cpu", weights_only=True)
199
+
200
+ # Check if already migrated
201
+ total_params = sum(len(pg["params"]) for pg in opt["param_groups"])
202
+ expected_old = N_LAYERS * OLD_PARAMS_PER_LAYER + 58 # 168 + 58 = 226
203
+ expected_new = N_LAYERS * NEW_PARAMS_PER_LAYER + 58 # 112 + 58 = 170
204
+
205
+ if total_params == expected_old:
206
+ opt_backup = ckpt_dir / "optimizer.pt.backup_pre_qkv"
207
+ if not opt_backup.exists():
208
+ torch.save(opt, opt_backup)
209
+ print(f" Backup: {opt_backup}")
210
+ new_opt = migrate_optimizer(opt)
211
+ torch.save(new_opt, opt_path)
212
+ print(f" Optimizer saved.")
213
+ elif total_params == expected_new:
214
+ print(f" Optimizer already migrated ({total_params} params). Skipping.")
215
+ else:
216
+ print(f" [WARN] Unexpected param count {total_params} "
217
+ f"(expected old={expected_old} or new={expected_new}). "
218
+ f"Deleting optimizer.pt — optimizer will restart fresh.")
219
+ opt_path.unlink()
220
+ else:
221
+ print("\n[2/2] No optimizer.pt found. Optimizer will restart fresh.")
222
+
223
+ print("\nMigration complete!")
224
+
225
+
226
+ if __name__ == "__main__":
227
+ if len(sys.argv) != 2:
228
+ print(__doc__)
229
+ sys.exit(1)
230
+ migrate(Path(sys.argv[1]))
source/scripts/monitor_3b.sh ADDED
@@ -0,0 +1,316 @@
 
 
1
+ #!/usr/bin/env bash
2
+ # =============================================================================
3
+ # monitor_3b.sh — live monitoring, anomaly detection and automatic checkpoint cleanup for the 3B run
4
+ #
5
+ # Usage:
6
+ # bash scripts/monitor_3b.sh # default monitoring
7
+ # bash scripts/monitor_3b.sh --check-once # run the checks once
8
+ # bash scripts/monitor_3b.sh --auto-cleanup # automatically delete old checkpoints
9
+ #
10
+ # 3B-specific behaviour:
11
+ # - checkpoints are 27 GB each → stricter disk monitoring
12
+ # - NCCL hang detection + optional auto-restart
13
+ # - live estimated-completion-time calculation
14
+ # - prevents duplicate process launches
15
+ # =============================================================================
16
+ set -euo pipefail
17
+
18
+ # ---- Configuration ----------------------------------------------------------
19
+ RUN_NAME="${RUN_NAME:-korean_3b_fp8_run1}"
20
+ LOG_FILE="${1:-checkpoints/${RUN_NAME}/train.log}"
21
+ CKPT_DIR="checkpoints/${RUN_NAME}"
22
+ CHECK_INTERVAL=60 # 3B steps are farther apart → 60 s
23
+ ZERO_LOSS_THRESHOLD=3
24
+ GNORM_WARN=10.0
25
+ GNORM_CRITICAL=50.0
26
+ LOSS_SPIKE_FACTOR=3.0
27
+ STALL_TIMEOUT=600 # 10 min (3B steps take longer)
28
+ DISK_WARN_PCT=85
29
+ DISK_CRITICAL_PCT=92
30
+ GPU_UTIL_WARN=50
31
+ MAX_CHECKPOINTS=15 # maximum number of retained checkpoints (15 × 27 GB = 405 GB)
32
+ CHECK_ONCE=false
33
+ AUTO_CLEANUP=false
34
+ AUTO_RESTART=false
35
+
36
+ # Parse args
37
+ for arg in "$@"; do
38
+ case "$arg" in
39
+ --check-once) CHECK_ONCE=true ;;
40
+ --auto-cleanup) AUTO_CLEANUP=true ;;
41
+ --auto-restart) AUTO_RESTART=true ;;
42
+ esac
43
+ done
44
+ # Fix LOG_FILE if first arg was a flag
45
+ if [[ "$LOG_FILE" == --* ]]; then
46
+ LOG_FILE="checkpoints/${RUN_NAME}/train.log"
47
+ fi
48
+
49
+ # ---- Colors -----------------------------------------------------------------
50
+ RED='\033[0;31m'; YELLOW='\033[1;33m'; GREEN='\033[0;32m'
51
+ CYAN='\033[0;36m'; MAGENTA='\033[0;35m'; NC='\033[0m'
52
+
53
+ timestamp() { date '+%Y-%m-%d %H:%M:%S'; }
54
+
55
+ alert() {
56
+ local level="$1" msg="$2"
57
+ case "$level" in
58
+ CRITICAL) echo -e "${RED}🔴 [$(timestamp)] [CRITICAL] ${msg}${NC}" ;;
59
+ WARNING) echo -e "${YELLOW}🟠 [$(timestamp)] [WARNING] ${msg}${NC}" ;;
60
+ INFO) echo -e "${CYAN}🟡 [$(timestamp)] [INFO] ${msg}${NC}" ;;
61
+ OK) echo -e "${GREEN}✅ [$(timestamp)] [OK] ${msg}${NC}" ;;
62
+ esac
63
+ }
64
+
65
+ # ---- Parse metrics ----------------------------------------------------------
66
+ parse_metrics() {
67
+ local n="${1:-20}"
68
+ [[ -f "$LOG_FILE" ]] || return
69
+ tail -n "$n" "$LOG_FILE" | grep "step.*loss.*gnorm" || true
70
+ }
71
+
72
+ extract_field() {
73
+ echo "$1" | grep -oP "${2}\s+\K[0-9]+\.[0-9e+\-]+" | head -1
74
+ }
75
+
76
+ extract_step() {
77
+ echo "$1" | grep -oP "step\s+\K[0-9]+" | head -1
78
+ }
79
+
80
+ # ---- Check: Loss = 0 -------------------------------------------------------
81
+ check_loss_zero() {
82
+ local lines
83
+ lines=$(parse_metrics "$ZERO_LOSS_THRESHOLD")
84
+ [[ -z "$lines" ]] && return 0
85
+ local zero_count=0
86
+ while IFS= read -r line; do
87
+ local loss=$(extract_field "$line" "loss")
88
+ if [[ -n "$loss" ]] && (( $(echo "$loss < 0.001" | bc -l 2>/dev/null || echo 0) )); then
89
+ ((zero_count++))
90
+ fi
91
+ done <<< "$lines"
92
+ if [[ $zero_count -ge $ZERO_LOSS_THRESHOLD ]]; then
93
+ alert CRITICAL "Loss가 ${zero_count}회 연속 ~0! Labels 버그. 즉시 중단!"
94
+ return 1
95
+ fi
96
+ }
97
+
98
+ # ---- Check: Loss spike -----------------------------------------------------
99
+ check_loss_spike() {
100
+ local lines=$(parse_metrics 20)
101
+ [[ -z "$lines" ]] && return 0
102
+ local losses=()
103
+ while IFS= read -r line; do
104
+ local loss=$(extract_field "$line" "loss")
105
+ [[ -n "$loss" ]] && losses+=("$loss")
106
+ done <<< "$lines"
107
+ local count=${#losses[@]}
108
+ [[ $count -lt 5 ]] && return 0
109
+ local last="${losses[$((count-1))]}"
110
+ local sum=0
111
+ for ((i=0; i<count-1; i++)); do
112
+ sum=$(echo "$sum + ${losses[$i]}" | bc -l 2>/dev/null || echo "$sum")
113
+ done
114
+ local avg=$(echo "$sum / ($count - 1)" | bc -l 2>/dev/null || echo "0")
115
+ if [[ "$avg" != "0" ]]; then
116
+ local ratio=$(echo "$last / $avg" | bc -l 2>/dev/null || echo "1")
117
+ if (( $(echo "$ratio > $LOSS_SPIKE_FACTOR" | bc -l 2>/dev/null || echo 0) )); then
118
+ alert WARNING "Loss spike! 현재=${last}, 평균=${avg}, 비율=${ratio}x"
119
+ fi
120
+ fi
121
+ }
122
+
123
+ # ---- Check: Gradient norm ---------------------------------------------------
124
+ check_gnorm() {
125
+ local lines=$(parse_metrics 5)
126
+ [[ -z "$lines" ]] && return 0
127
+ local gnorm=$(extract_field "$(echo "$lines" | tail -1)" "gnorm")
128
+ [[ -z "$gnorm" ]] && return 0
129
+ if (( $(echo "$gnorm > $GNORM_CRITICAL" | bc -l 2>/dev/null || echo 0) )); then
130
+ alert CRITICAL "GNorm=${gnorm} > ${GNORM_CRITICAL}! 발산 직전."
131
+ elif (( $(echo "$gnorm > $GNORM_WARN" | bc -l 2>/dev/null || echo 0) )); then
132
+ alert WARNING "GNorm=${gnorm} 불안정."
133
+ fi
134
+ }
135
+
136
+ # ---- Check: Stall / NCCL hang ----------------------------------------------
137
+ check_stall() {
138
+ [[ ! -f "$LOG_FILE" ]] && return 0
139
+ local last_mod=$(stat -c %Y "$LOG_FILE" 2>/dev/null || echo 0)
140
+ local now=$(date +%s)
141
+ local diff=$((now - last_mod))
142
+ if [[ $diff -gt $STALL_TIMEOUT ]]; then
143
+ alert CRITICAL "로그 ${diff}초 ($(( diff/60 ))분) 멈춤! NCCL hang 가능성."
144
+ # Auto-restart on NCCL hang
145
+ if $AUTO_RESTART; then
146
+ alert WARNING "자동 재시작 시도..."
147
+ local pid=$(pgrep -f "pretrain.py.*korean_3b" | head -1 || true)
148
+ if [[ -n "$pid" ]]; then
149
+ kill -9 "$pid" 2>/dev/null || true
150
+ sleep 5
151
+ alert INFO "이전 프로세스 종료. launch_3b_pretrain.sh 재실행 필요."
152
+ fi
153
+ fi
154
+ fi
155
+ }
156
+
157
+ # ---- Check: Disk (stricter for 3B) --------------------------------------------------
158
+ check_disk() {
159
+ local usage=$(df /PROJECT 2>/dev/null | awk 'NR==2 {print $5}' | tr -d '%')
160
+ if [[ -n "$usage" && "$usage" -gt "$DISK_CRITICAL_PCT" ]]; then
161
+ alert CRITICAL "디스크 ${usage}% > ${DISK_CRITICAL_PCT}%! 즉시 정리 필요!"
162
+ $AUTO_CLEANUP && cleanup_old_checkpoints
163
+ elif [[ -n "$usage" && "$usage" -gt "$DISK_WARN_PCT" ]]; then
164
+ alert WARNING "디스크 ${usage}% > ${DISK_WARN_PCT}%. 체크포인트 정리 권장."
165
+ fi
166
+ }
167
+
168
+ # ---- Check: GPU utilization -------------------------------------------------
169
+ check_gpu() {
170
+ command -v nvidia-smi &>/dev/null || return 0
171
+ local low=0 total=0
172
+ while IFS= read -r util; do
173
+ ((total++))
174
+ [[ "$util" -lt "$GPU_UTIL_WARN" ]] && ((low++))
175
+ done < <(nvidia-smi --query-gpu=utilization.gpu --format=csv,noheader,nounits 2>/dev/null)
176
+ [[ $total -gt 0 && $low -gt 0 ]] && alert INFO "${low}/${total} GPU util < ${GPU_UTIL_WARN}%"
177
+ }
178
+
179
+ # ---- Check: Checkpoint integrity -----------------------------------------------
180
+ check_checkpoint_integrity() {
181
+ local latest=$(ls -d "${CKPT_DIR}"/checkpoint-* 2>/dev/null | sort -V | tail -1 || true)
182
+ [[ -z "$latest" ]] && return 0
183
+ # Make sure the minimum expected files exist
184
+ if [[ ! -f "${latest}/model.pt" ]] && [[ ! -f "${latest}/model.safetensors" ]]; then
185
+ alert WARNING "최근 체크포인트에 모델 파일 없음: ${latest}"
186
+ fi
187
+ # Size check (a 3B model.pt is at least 2 GB)
188
+ local size=$(du -sb "${latest}" 2>/dev/null | awk '{print $1}')
189
+ if [[ -n "$size" && "$size" -lt 2000000000 ]]; then
190
+ alert WARNING "체크포인트 크기 비정상 (${size} bytes < 2GB): ${latest}"
191
+ fi
192
+ }
193
+
194
+ # ---- Cleanup: automatically delete old checkpoints ------------------------------------
195
+ cleanup_old_checkpoints() {
196
+ local ckpts=($(ls -d "${CKPT_DIR}"/checkpoint-* 2>/dev/null | sort -V))
197
+ local count=${#ckpts[@]}
198
+ if [[ $count -le $MAX_CHECKPOINTS ]]; then
199
+ alert OK "체크포인트 ${count}개 ≤ ${MAX_CHECKPOINTS}. 정리 불필요."
200
+ return
201
+ fi
202
+ # Preserve milestone checkpoints (every 10K steps)
203
+ local deletable=()
204
+ local preserved=()
205
+ for ckpt in "${ckpts[@]}"; do
206
+ local step_num=$(basename "$ckpt" | grep -oP '\d+' || echo "0")
207
+ if (( step_num % 10000 == 0 && step_num > 0 )); then
208
+ preserved+=("$ckpt")
209
+ else
210
+ deletable+=("$ckpt")
211
+ fi
212
+ done
213
+ # Always keep the most recent MAX_CHECKPOINTS checkpoints
214
+ local n_deletable=${#deletable[@]}
215
+ local total_keep=$(( ${#preserved[@]} + MAX_CHECKPOINTS ))
216
+ local to_delete=$(( count - total_keep ))
217
+ [[ $to_delete -le 0 ]] && { alert OK "정리 불필요 (이정표 ${#preserved[@]}개 + 최근 ${MAX_CHECKPOINTS}개 보존)."; return; }
218
+ alert INFO "${count}개 체크포인트 → ${to_delete}개 삭제 (이정표 ${#preserved[@]}개 영구 보존)"
219
+ local deleted=0
220
+ for ckpt in "${deletable[@]}"; do
221
+ [[ $deleted -ge $to_delete ]] && break
222
+ local ckpt_size=$(du -sh "$ckpt" 2>/dev/null | awk '{print $1}')
223
+ echo " 삭제: $ckpt (${ckpt_size})"
224
+ rm -rf "$ckpt"
225
+ ((deleted++))
226
+ done
227
+ alert OK "체크포인트 정리 완료. (${deleted}개 삭제)"
228
+ }
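The retention rule implemented by cleanup_old_checkpoints() (milestone checkpoints every 10K steps are kept forever, plus the most recent MAX_CHECKPOINTS) can be summarised as a small selection function; this is a sketch with hypothetical step numbers, not code from the repository:

```python
def checkpoints_to_delete(steps, max_keep=15, milestone=10_000):
    """Oldest non-milestone checkpoint steps to remove so that only the
    milestones plus the max_keep most recent checkpoints remain."""
    steps = sorted(steps)
    milestones = {s for s in steps if s > 0 and s % milestone == 0}
    deletable = [s for s in steps if s not in milestones]
    excess = len(steps) - (len(milestones) + max_keep)
    return deletable[:max(excess, 0)]  # delete oldest first, only as many as needed

# Hypothetical run with a checkpoint every 1000 steps up to 37000.
steps = list(range(1000, 38_000, 1000))
print(checkpoints_to_delete(steps)[:5])  # [1000, 2000, 3000, 4000, 5000]
```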
229
+
230
+ # ---- ETA calculation ---------------------------------------------------------------
231
+ estimate_eta() {
232
+ [[ ! -f "$LOG_FILE" ]] && return
233
+ # Latest step number + timestamp
234
+ local lines=$(parse_metrics 50)
235
+ [[ -z "$lines" ]] && return
236
+ local last_line=$(echo "$lines" | tail -1)
237
+ local first_line=$(echo "$lines" | head -1)
238
+ local cur_step=$(extract_step "$last_line")
239
+ local max_steps=$(grep -oP "max_steps.*?(\d+)" "${CKPT_DIR}/train.log" 2>/dev/null | head -1 | grep -oP '\d+$' || echo "57000")
240
+
241
+ [[ -z "$cur_step" || "$cur_step" == "0" ]] && return
242
+
243
+ # step/sec from log timestamps (approximate)
244
+ local remaining=$((max_steps - cur_step))
245
+ if [[ $remaining -le 0 ]]; then
246
+ echo -e "${MAGENTA}📊 진행: ${cur_step}/${max_steps} (완료!)${NC}"
247
+ return
248
+ fi
249
+
250
+ # Rough ETA based on file timestamps
251
+ local first_time=$(head -20 "$LOG_FILE" 2>/dev/null | grep -oP '^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}' | head -1 || true)
252
+ if [[ -n "$first_time" ]]; then
253
+ local start_epoch=$(date -d "$first_time" +%s 2>/dev/null || echo 0)
254
+ local now=$(date +%s)
255
+ if [[ $start_epoch -gt 0 && $cur_step -gt 0 ]]; then
256
+ local elapsed=$((now - start_epoch))
257
+ local sec_per_step=$(echo "$elapsed / $cur_step" | bc -l 2>/dev/null || echo "0")
258
+ local eta_sec=$(echo "$remaining * $sec_per_step" | bc 2>/dev/null | cut -d. -f1 || echo "0")
259
+ local eta_hours=$(echo "$eta_sec / 3600" | bc 2>/dev/null || echo "?")
260
+ local pct=$(echo "scale=1; $cur_step * 100 / $max_steps" | bc 2>/dev/null || echo "?")
261
+ echo -e "${MAGENTA}📊 진행: ${cur_step}/${max_steps} (${pct}%) | 남은 시간: ~${eta_hours}h | ${sec_per_step}s/step${NC}"
262
+ fi
263
+ else
264
+ echo -e "${MAGENTA}📊 진행: ${cur_step}/${max_steps}${NC}"
265
+ fi
266
+ }
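estimate_eta() is plain linear extrapolation from the first log timestamp; the same arithmetic as a sketch with placeholder numbers:

```python
# Linear ETA extrapolation, as in estimate_eta(): elapsed time divided by steps done so far.
cur_step, max_steps = 21_500, 57_000   # hypothetical progress
elapsed_sec = cur_step * 4.2           # pretend the run has averaged ~4.2 s/step

sec_per_step = elapsed_sec / cur_step
eta_hours = (max_steps - cur_step) * sec_per_step / 3600
pct = 100 * cur_step / max_steps
print(f"{cur_step}/{max_steps} ({pct:.1f}%) | ~{eta_hours:.0f}h left | {sec_per_step:.1f}s/step")
```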
267
+
268
+ # ---- Status summary ---------------------------------------------------------
269
+ print_status() {
270
+ local lines=$(parse_metrics 1)
271
+ [[ -n "$lines" ]] && echo -e "${GREEN}최근:${NC} $lines"
272
+ estimate_eta
273
+ if command -v nvidia-smi &>/dev/null; then
274
+ echo -e "${CYAN}GPU:${NC}"
275
+ nvidia-smi --query-gpu=index,memory.used,memory.total,utilization.gpu,temperature.gpu \
276
+ --format=csv,noheader 2>/dev/null | head -8
277
+ fi
278
+ local ckpt_count=$(ls -d "${CKPT_DIR}"/checkpoint-* 2>/dev/null | wc -l)
279
+ local ckpt_size=$(du -sh "${CKPT_DIR}" 2>/dev/null | awk '{print $1}')
280
+ echo -e "${CYAN}체크포인트:${NC} ${ckpt_count}개 (${ckpt_size})"
281
+ local disk=$(df -h /PROJECT 2>/dev/null | awk 'NR==2 {print $3"/"$2" ("$5")"}')
282
+ echo -e "${CYAN}디스크:${NC} ${disk}"
283
+ }
284
+
285
+ # ---- Main -------------------------------------------------------------------
286
+ echo "=================================================================="
287
+ echo " 3B Training Monitor"
288
+ echo " Run: ${RUN_NAME}"
289
+ echo " Log: ${LOG_FILE}"
290
+ echo " Interval: ${CHECK_INTERVAL}s"
291
+ echo " Auto-cleanup: ${AUTO_CLEANUP} | Auto-restart: ${AUTO_RESTART}"
292
+ echo " Ctrl+C to stop"
293
+ echo "=================================================================="
294
+
295
+ run_all_checks() {
296
+ check_loss_zero || true
297
+ check_loss_spike || true
298
+ check_gnorm || true
299
+ check_stall || true
300
+ check_disk || true
301
+ check_gpu || true
302
+ check_checkpoint_integrity || true
303
+ echo "---"
304
+ print_status
305
+ echo ""
306
+ }
307
+
308
+ if $CHECK_ONCE; then
309
+ run_all_checks
310
+ exit 0
311
+ fi
312
+
313
+ while true; do
314
+ run_all_checks
315
+ sleep "$CHECK_INTERVAL"
316
+ done
source/scripts/monitor_training.sh ADDED
@@ -0,0 +1,244 @@
 
 
1
+ #!/usr/bin/env bash
2
+ # =============================================================================
3
+ # monitor_training.sh — live SFT training monitoring + anomaly detection
4
+ #
5
+ # Usage:
6
+ # bash scripts/monitor_training.sh # default log path
7
+ # bash scripts/monitor_training.sh /path/to/train.log # custom path
8
+ # bash scripts/monitor_training.sh --check-once # run the checks once, then exit
9
+ #
10
+ # Monitored conditions:
11
+ # 🔴 loss = 0.0000 (3 consecutive steps) → labels bug
12
+ # 🔴 gnorm > 50.0 → about to diverge
13
+ # 🔴 log silent for 5+ minutes → hang
14
+ # 🟠 loss spike (3× the moving average) → bad batch / LR
15
+ # 🟠 gnorm > 10.0 → unstable
16
+ # 🟠 disk > 80% → cleanup needed
17
+ # 🟡 GPU util < 50% → bottleneck
18
+ # =============================================================================
19
+ set -euo pipefail
20
+
21
+ # ---- Configuration ----------------------------------------------------------
22
+ LOG_FILE="${1:-checkpoints/korean_1b_sft/train.log}"
23
+ CHECK_INTERVAL=30 # polling interval in seconds
24
+ ZERO_LOSS_THRESHOLD=3 # warn if loss=0 for N consecutive steps
25
+ GNORM_WARN=10.0
26
+ GNORM_CRITICAL=50.0
27
+ LOSS_SPIKE_FACTOR=3.0 # spike if the last loss is ≥ N× the moving average
28
+ STALL_TIMEOUT=300 # seconds (5 min) of log silence before flagging a stall
29
+ DISK_WARN_PCT=80
30
+ GPU_UTIL_WARN=50
31
+ CHECK_ONCE=false
32
+
33
+ if [[ "${1:-}" == "--check-once" ]]; then
34
+ CHECK_ONCE=true
35
+ LOG_FILE="${2:-checkpoints/korean_1b_sft/train.log}"
36
+ fi
37
+
38
+ # ---- Colors -----------------------------------------------------------------
39
+ RED='\033[0;31m'
40
+ YELLOW='\033[1;33m'
41
+ GREEN='\033[0;32m'
42
+ CYAN='\033[0;36m'
43
+ NC='\033[0m'
44
+
45
+ # ---- Helper -----------------------------------------------------------------
46
+ timestamp() { date '+%Y-%m-%d %H:%M:%S'; }
47
+
48
+ alert() {
49
+ local level="$1" msg="$2"
50
+ case "$level" in
51
+ CRITICAL) echo -e "${RED}🔴 [$(timestamp)] [CRITICAL] ${msg}${NC}" ;;
52
+ WARNING) echo -e "${YELLOW}🟠 [$(timestamp)] [WARNING] ${msg}${NC}" ;;
53
+ INFO) echo -e "${CYAN}🟡 [$(timestamp)] [INFO] ${msg}${NC}" ;;
54
+ OK) echo -e "${GREEN}✅ [$(timestamp)] [OK] ${msg}${NC}" ;;
55
+ esac
56
+ }
57
+
58
+ # ---- Parse last N log lines -------------------------------------------------
59
+ parse_metrics() {
60
+ # Log format: [timestamp] [INFO] step XXXX | loss X.XXXX | lr X.XXe-XX | gnorm X.XXX | ...
61
+ local n="${1:-20}"
62
+ if [[ ! -f "$LOG_FILE" ]]; then
63
+ echo ""
64
+ return
65
+ fi
66
+ tail -n "$n" "$LOG_FILE" | grep "step.*loss.*gnorm" || true
67
+ }
68
+
69
+ extract_field() {
70
+ # $1=line, $2=field name (loss, gnorm, lr)
71
+ echo "$1" | grep -oP "${2}\s+\K[0-9]+\.[0-9e+\-]+" | head -1
72
+ }
73
+
74
+ # ---- Check functions --------------------------------------------------------
75
+
76
+ check_loss_zero() {
77
+ local lines
78
+ lines=$(parse_metrics "$ZERO_LOSS_THRESHOLD")
79
+ if [[ -z "$lines" ]]; then return; fi
80
+
81
+ local zero_count=0
82
+ while IFS= read -r line; do
83
+ local loss
84
+ loss=$(extract_field "$line" "loss")
85
+ if [[ -n "$loss" ]]; then
86
+ # loss < 0.001
87
+ if (( $(echo "$loss < 0.001" | bc -l 2>/dev/null || echo 0) )); then
88
+ ((zero_count++))
89
+ fi
90
+ fi
91
+ done <<< "$lines"
92
+
93
+ if [[ $zero_count -ge $ZERO_LOSS_THRESHOLD ]]; then
94
+ alert CRITICAL "Loss가 ${zero_count}회 연속 ~0! Labels 버그 가능성. 즉시 학습 중단!"
95
+ return 1
96
+ fi
97
+ return 0
98
+ }
99
+
100
+ check_loss_spike() {
101
+ local lines
102
+ lines=$(parse_metrics 20)
103
+ if [[ -z "$lines" ]]; then return 0; fi
104
+
105
+ local losses=()
106
+ while IFS= read -r line; do
107
+ local loss
108
+ loss=$(extract_field "$line" "loss")
109
+ [[ -n "$loss" ]] && losses+=("$loss")
110
+ done <<< "$lines"
111
+
112
+ local count=${#losses[@]}
113
+ if [[ $count -lt 5 ]]; then return 0; fi
114
+
115
+ # Compare the latest value against the mean of the preceding ones
116
+ local last_loss="${losses[$((count-1))]}"
117
+ local sum=0
118
+ for ((i=0; i<count-1; i++)); do
119
+ sum=$(echo "$sum + ${losses[$i]}" | bc -l 2>/dev/null || echo "$sum")
120
+ done
121
+ local avg=$(echo "$sum / ($count - 1)" | bc -l 2>/dev/null || echo "0")
122
+
123
+ if [[ "$avg" != "0" ]]; then
124
+ local ratio=$(echo "$last_loss / $avg" | bc -l 2>/dev/null || echo "1")
125
+ if (( $(echo "$ratio > $LOSS_SPIKE_FACTOR" | bc -l 2>/dev/null || echo 0) )); then
126
+ alert WARNING "Loss spike 감지! 현재=${last_loss}, 평균=${avg}, 비율=${ratio}x"
127
+ fi
128
+ fi
129
+ return 0
130
+ }
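The spike rule in check_loss_spike() flags the newest loss when it exceeds LOSS_SPIKE_FACTOR times the mean of the earlier window; a minimal numeric sketch of the same check:

```python
def is_loss_spike(losses, factor=3.0, min_points=5):
    """True if the latest loss exceeds factor times the mean of the earlier values."""
    if len(losses) < min_points:
        return False
    *history, last = losses
    avg = sum(history) / len(history)
    return avg > 0 and last / avg > factor

print(is_loss_spike([2.1, 2.0, 2.05, 1.98, 2.02, 6.5]))   # True: 6.5 is > 3x the ~2.0 average
print(is_loss_spike([2.1, 2.0, 2.05, 1.98, 2.02, 2.10]))  # False
```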
131
+
132
+ check_gnorm() {
133
+ local lines
134
+ lines=$(parse_metrics 5)
135
+ if [[ -z "$lines" ]]; then return 0; fi
136
+
137
+ local last_line
138
+ last_line=$(echo "$lines" | tail -1)
139
+ local gnorm
140
+ gnorm=$(extract_field "$last_line" "gnorm")
141
+
142
+ if [[ -z "$gnorm" ]]; then return 0; fi
143
+
144
+ if (( $(echo "$gnorm > $GNORM_CRITICAL" | bc -l 2>/dev/null || echo 0) )); then
145
+ alert CRITICAL "GNorm=${gnorm} > ${GNORM_CRITICAL}! 발산 직전. 학습 중단 고려."
146
+ elif (( $(echo "$gnorm > $GNORM_WARN" | bc -l 2>/dev/null || echo 0) )); then
147
+ alert WARNING "GNorm=${gnorm} > ${GNORM_WARN}. 불안정 징후."
148
+ fi
149
+ return 0
150
+ }
151
+
152
+ check_stall() {
153
+ if [[ ! -f "$LOG_FILE" ]]; then
154
+ alert INFO "로그 파일 없음: ${LOG_FILE}"
155
+ return 0
156
+ fi
157
+
158
+ local last_modified
159
+ last_modified=$(stat -c %Y "$LOG_FILE" 2>/dev/null || echo 0)
160
+ local now
161
+ now=$(date +%s)
162
+ local diff=$((now - last_modified))
163
+
164
+ if [[ $diff -gt $STALL_TIMEOUT ]]; then
165
+ alert CRITICAL "로그가 ${diff}초 ($(( diff/60 ))분) 동안 업데이트 없음! Hang 가능성."
166
+ fi
167
+ return 0
168
+ }
169
+
170
+ check_disk() {
171
+ local usage
172
+ usage=$(df /PROJECT 2>/dev/null | awk 'NR==2 {print $5}' | tr -d '%')
173
+ if [[ -n "$usage" && "$usage" -gt "$DISK_WARN_PCT" ]]; then
174
+ alert WARNING "디스크 사용률 ${usage}% > ${DISK_WARN_PCT}%. 체크포인트 정리 필요."
175
+ fi
176
+ return 0
177
+ }
178
+
179
+ check_gpu() {
180
+ if ! command -v nvidia-smi &>/dev/null; then return 0; fi
181
+
182
+ local low_util=0
183
+ local total_gpus=0
184
+ while IFS= read -r util; do
185
+ ((total_gpus++))
186
+ if [[ "$util" -lt "$GPU_UTIL_WARN" ]]; then
187
+ ((low_util++))
188
+ fi
189
+ done < <(nvidia-smi --query-gpu=utilization.gpu --format=csv,noheader,nounits 2>/dev/null)
190
+
191
+ if [[ $total_gpus -gt 0 && $low_util -gt 0 ]]; then
192
+ alert INFO "${low_util}/${total_gpus} GPU utilization < ${GPU_UTIL_WARN}%. 데이터 로딩 병목?"
193
+ fi
194
+ return 0
195
+ }
196
+
197
+ # ---- Status summary ---------------------------------------------------------
198
+ print_status() {
199
+ local lines
200
+ lines=$(parse_metrics 1)
201
+ if [[ -n "$lines" ]]; then
202
+ echo -e "${GREEN}최근 로그:${NC} $lines"
203
+ fi
204
+
205
+ if command -v nvidia-smi &>/dev/null; then
206
+ echo -e "${CYAN}GPU 메모리:${NC}"
207
+ nvidia-smi --query-gpu=index,memory.used,memory.total,utilization.gpu \
208
+ --format=csv,noheader 2>/dev/null | head -8
209
+ fi
210
+
211
+ local disk
212
+ disk=$(df -h /PROJECT 2>/dev/null | awk 'NR==2 {print "사용: "$3"/"$2" ("$5")"}')
213
+ echo -e "${CYAN}디스크:${NC} ${disk}"
214
+ }
215
+
216
+ # ---- Main loop --------------------------------------------------------------
217
+ echo "=================================================================="
218
+ echo " SFT Training Monitor"
219
+ echo " Log file: ${LOG_FILE}"
220
+ echo " Check interval: ${CHECK_INTERVAL}s"
221
+ echo " Press Ctrl+C to stop"
222
+ echo "=================================================================="
223
+
224
+ run_all_checks() {
225
+ check_loss_zero || true
226
+ check_loss_spike || true
227
+ check_gnorm || true
228
+ check_stall || true
229
+ check_disk || true
230
+ check_gpu || true
231
+ echo "---"
232
+ print_status
233
+ echo ""
234
+ }
235
+
236
+ if $CHECK_ONCE; then
237
+ run_all_checks
238
+ exit 0
239
+ fi
240
+
241
+ while true; do
242
+ run_all_checks
243
+ sleep "$CHECK_INTERVAL"
244
+ done
source/scripts/openclaw-watchdog.sh ADDED
@@ -0,0 +1,243 @@
 
 
1
+ #!/usr/bin/env bash
2
+ #
3
+ # openclaw-watchdog.sh — OpenClaw Gateway health check + auto-restart
4
+ # Register in crontab to run every minute
5
+ #
6
+ # Usage:
7
+ # */1 * * * * /PROJECT/0325120031_A/ghong/taketimes/llm-bang/scripts/openclaw-watchdog.sh
8
+ #
9
+ # Change log:
10
+ # 2026-03-01 Switched the network check from ICMP to HTTP (handles environments that block ICMP)
11
+ # Added multi-endpoint fallback and a gateway HTTP response check
12
+ # Detached execution via setsid, more detailed logging
13
+
14
+ set -euo pipefail
15
+
16
+ # ── Configuration ──────────────────────────────────────
17
+ RNTIER_HOME="REDACTED_RNTIER_PATH"
18
+ OPENCLAW_BIN="${RNTIER_HOME}/.npm-global/bin/openclaw"
19
+ GATEWAY_PORT=18789
20
+ GATEWAY_HOST="127.0.0.1"
21
+ PID_FILE="/tmp/openclaw-gateway.pid"
22
+ LOG_DIR="/tmp/openclaw"
23
+ LOG_FILE="${LOG_DIR}/watchdog.log"
24
+ GATEWAY_LOG="${LOG_DIR}/gateway.log"
25
+ MAX_LOG_SIZE=$((10 * 1024 * 1024)) # rotate at 10 MB
26
+ RESTART_COOLDOWN=120 # seconds: suppress retries for this long after a restart
27
+ LAST_RESTART_FILE="/tmp/openclaw-last-restart"
28
+ CONSECUTIVE_FAIL_FILE="/tmp/openclaw-consecutive-fails"
29
+
30
+ # Environment variables so that openclaw can locate its config
31
+ export PATH="${RNTIER_HOME}/.npm-global/bin:/usr/bin:/usr/local/bin:/bin:$PATH"
32
+ export HOME="/home/ghong"
33
+ export OPENCLAW_STATE_DIR="${RNTIER_HOME}/.openclaw"
34
+ export OPENCLAW_CONFIG_PATH="${RNTIER_HOME}/.openclaw/openclaw.json"
35
+
36
+ # ── Functions ──────────────────────────────────────────
37
+ mkdir -p "$LOG_DIR"
38
+
39
+ log() {
40
+ echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*" >> "$LOG_FILE"
41
+ }
42
+
43
+ rotate_log() {
44
+ local file="$1"
45
+ if [[ -f "$file" ]] && [[ $(stat -c%s "$file" 2>/dev/null || echo 0) -gt $MAX_LOG_SIZE ]]; then
46
+ mv "$file" "${file}.old"
47
+ log "Log rotated: $file"
48
+ fi
49
+ }
50
+
51
+ # Check the local HTTP response on the gateway's actual endpoint
52
+ check_gateway_http() {
53
+ if command -v curl &>/dev/null; then
54
+ curl -sf --max-time 5 -o /dev/null "http://${GATEWAY_HOST}:${GATEWAY_PORT}/__openclaw__/canvas/" 2>/dev/null
55
+ return $?
56
+ fi
57
+ return 1
58
+ }
59
+
60
+ is_port_open() {
61
+ if command -v ss &>/dev/null; then
62
+ ss -tlnH "sport = :${GATEWAY_PORT}" 2>/dev/null | grep -q "$GATEWAY_PORT"
63
+ else
64
+ (echo > /dev/tcp/"$GATEWAY_HOST"/"$GATEWAY_PORT") 2>/dev/null
65
+ fi
66
+ }
67
+
68
+ is_process_alive() {
69
+ if [[ -f "$PID_FILE" ]]; then
70
+ local pid
71
+ pid=$(cat "$PID_FILE" 2>/dev/null)
72
+ if [[ -n "$pid" ]] && kill -0 "$pid" 2>/dev/null; then
73
+ return 0
74
+ fi
75
+ fi
76
+ pgrep -f "openclaw.*gateway" >/dev/null 2>&1
77
+ }
78
+
79
+ # Network check based on DNS resolution
80
+ # On this server both ICMP (ping) and outbound HTTPS (curl) are blocked.
81
+ # DNS resolution still works, though, and the gateway (Node.js) can communicate via long-polling.
82
+ # So DNS resolution success is used to decide whether the network itself is alive.
83
+ check_network() {
84
+ # Method 1: getent (fastest and lightest)
85
+ if command -v getent &>/dev/null; then
86
+ getent hosts api.telegram.org >/dev/null 2>&1 && return 0
87
+ getent hosts api.anthropic.com >/dev/null 2>&1 && return 0
88
+ fi
89
+ # Method 2: nslookup
90
+ if command -v nslookup &>/dev/null; then
91
+ nslookup -timeout=5 api.telegram.org >/dev/null 2>&1 && return 0
92
+ fi
93
+ # Method 3: check the DNS server (168.126.63.1) on port 53 via /dev/tcp
94
+ (echo > /dev/tcp/168.126.63.1/53) 2>/dev/null && return 0
95
+ return 1
96
+ }
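For comparison, the same DNS-reachability idea takes only a few lines of Python (a sketch; the hostnames mirror the ones the shell function probes):

```python
import socket

def network_alive(hosts=("api.telegram.org", "api.anthropic.com")):
    """True if any hostname resolves; DNS still works even where ICMP and outbound HTTPS are blocked."""
    for host in hosts:
        try:
            socket.getaddrinfo(host, 443)
            return True
        except socket.gaierror:
            continue
    return False

print(network_alive())
```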
97
+
98
+ cooldown_active() {
99
+ if [[ -f "$LAST_RESTART_FILE" ]]; then
100
+ local last_restart now diff
101
+ last_restart=$(cat "$LAST_RESTART_FILE" 2>/dev/null)
102
+ now=$(date +%s)
103
+ diff=$(( now - last_restart ))
104
+ if [[ $diff -lt $RESTART_COOLDOWN ]]; then
105
+ return 0 # still in cooldown
106
+ fi
107
+ fi
108
+ return 1 # not in cooldown
109
+ }
110
+
111
+ get_consecutive_fails() {
112
+ if [[ -f "$CONSECUTIVE_FAIL_FILE" ]]; then
113
+ cat "$CONSECUTIVE_FAIL_FILE" 2>/dev/null || echo 0
114
+ else
115
+ echo 0
116
+ fi
117
+ }
118
+
119
+ set_consecutive_fails() {
120
+ echo "$1" > "$CONSECUTIVE_FAIL_FILE"
121
+ }
122
+
123
+ start_gateway() {
124
+ log "ACTION: Starting OpenClaw gateway on port $GATEWAY_PORT..."
125
+
126
+ # Clean up any stale gateway processes
127
+ local old_pids
128
+ old_pids=$(pgrep -f "openclaw.*gateway" 2>/dev/null || true)
129
+ if [[ -n "$old_pids" ]]; then
130
+ log "ACTION: Killing stale gateway processes: $old_pids"
131
+ echo "$old_pids" | xargs kill -9 2>/dev/null || true
132
+ sleep 2
133
+ fi
134
+
135
+ # Start the gateway fully detached with setsid (prevents signal propagation from the parent process)
136
+ setsid nohup "$OPENCLAW_BIN" gateway run \
137
+ --port "$GATEWAY_PORT" \
138
+ --bind loopback \
139
+ >> "$GATEWAY_LOG" 2>&1 < /dev/null &
140
+
141
+ local new_pid=$!
142
+ echo "$new_pid" > "$PID_FILE"
143
+ date +%s > "$LAST_RESTART_FILE"
144
+
145
+ log "ACTION: Gateway launched with PID $new_pid (setsid)"
146
+
147
+ # Wait 8 seconds, then verify (the Telegram provider needs time to initialise)
148
+ sleep 8
149
+ if kill -0 "$new_pid" 2>/dev/null; then
150
+ log "OK: Gateway PID $new_pid is alive after startup"
151
+ if is_port_open; then
152
+ log "OK: Port $GATEWAY_PORT is listening"
153
+ else
154
+ log "WARN: Gateway alive but port $GATEWAY_PORT not yet listening (may need more time)"
155
+ fi
156
+ return 0
157
+ else
158
+ log "ERROR: Gateway PID $new_pid died immediately after start"
159
+ log "ERROR: Last 10 lines of gateway.log:"
160
+ tail -10 "$GATEWAY_LOG" 2>/dev/null | while read -r line; do
161
+ log " | $line"
162
+ done
163
+ return 1
164
+ fi
165
+ }
166
+
167
+ # ── Main logic ─────────────────────────────────────────
168
+ rotate_log "$LOG_FILE"
169
+ rotate_log "$GATEWAY_LOG"
170
+
171
+ # Remove old openclaw log files (older than 7 days)
172
+ find "$LOG_DIR" -name "openclaw-*.log" -mtime +7 -delete 2>/dev/null || true
173
+
174
+ # 1) Check process + port first (no network check needed if the gateway is alive)
175
+ process_ok=false
176
+ port_ok=false
177
+ http_ok=false
178
+
179
+ if is_process_alive; then
180
+ process_ok=true
181
+ fi
182
+
183
+ if is_port_open; then
184
+ port_ok=true
185
+ fi
186
+
187
+ if $port_ok && check_gateway_http; then
188
+ http_ok=true
189
+ fi
190
+
191
+ # 2) If the gateway is healthy, exit immediately
192
+ if $process_ok && $port_ok; then
193
+ if $http_ok; then
194
+ # fully healthy
195
+ set_consecutive_fails 0
196
+ exit 0
197
+ fi
198
+ # Process + port OK but no HTTP response → possibly hung
199
+ fails=$(get_consecutive_fails)
200
+ fails=$((fails + 1))
201
+ set_consecutive_fails "$fails"
202
+ log "WARN: Process alive, port open, but HTTP not responding (consecutive: $fails)"
203
+ if [[ $fails -lt 3 ]]; then
204
+ log "INFO: Waiting more cycles before restart (transient check, $fails/3)"
205
+ exit 0
206
+ fi
207
+ log "WARN: HTTP unresponsive for $fails consecutive checks — proceeding to restart"
208
+ fi
209
+
210
+ # 3) Gateway is unhealthy: check the network, then decide whether to restart
211
+ if $process_ok && ! $port_ok; then
212
+ log "WARN: Process alive but port $GATEWAY_PORT not listening. Possible hung state."
213
+ fi
214
+
215
+ if ! $process_ok && ! $port_ok; then
216
+ log "WARN: Gateway is completely down (no process, no port)."
217
+ fi
218
+
219
+ if ! $process_ok && $port_ok; then
220
+ log "WARN: No known gateway process but port $GATEWAY_PORT is in use. Stale process?"
221
+ fi
222
+
223
+ # 4) Network check, DNS based (runs only when the gateway is down)
224
+ if ! check_network; then
225
+ log "WARN: Network unreachable (DNS resolution failed). Skipping gateway restart."
226
+ exit 0
227
+ fi
228
+
229
+ # 5) Cooldown check
230
+ if cooldown_active; then
231
+ log "INFO: Cooldown active (last restart < ${RESTART_COOLDOWN}s ago). Skipping."
232
+ exit 0
233
+ fi
234
+
235
+ # 6) Restart
236
+ log "ACTION: Attempting gateway restart..."
237
+ if start_gateway; then
238
+ log "OK: Gateway restart SUCCESS"
239
+ set_consecutive_fails 0
240
+ else
241
+ log "ERROR: Gateway restart FAILED"
242
+ exit 1
243
+ fi
source/scripts/orpo_eval_watchdog.sh ADDED
@@ -0,0 +1,127 @@
 
 
1
+ #!/bin/bash
2
+ # =============================================================================
3
+ # ORPO Training Completion Watchdog
4
+ # =============================================================================
5
+ # Monitors the ORPO training process. When it finishes, automatically launches
6
+ # the comprehensive evaluation pipeline.
7
+ #
8
+ # Usage:
9
+ # nohup bash scripts/orpo_eval_watchdog.sh > checkpoints/korean_3b_orpo_v1/watchdog.log 2>&1 &
10
+ # =============================================================================
11
+
12
+ set -euo pipefail
13
+
14
+ PROJECT_ROOT="/PROJECT/0325120031_A/ghong/taketimes/llm-bang"
15
+ TRAIN_LOG="${PROJECT_ROOT}/checkpoints/korean_3b_orpo_v1/train.log"
16
+ TRAIN_PID=$(pgrep -f "train/orpo.py.*korean_3b_orpo_v1" | head -1 || true)  # || true: pgrep returning 1 would otherwise trip set -e/pipefail
17
+
18
+ echo "=============================================="
19
+ echo " ORPO Eval Watchdog Started"
20
+ echo "=============================================="
21
+ echo " Time : $(date '+%Y-%m-%d %H:%M:%S')"
22
+ echo " Train PID : ${TRAIN_PID:-NOT FOUND}"
23
+ echo " Train Log : ${TRAIN_LOG}"
24
+ echo "=============================================="
25
+
26
+ if [ -z "${TRAIN_PID}" ]; then
27
+ echo "[WARN] Training process not found. Checking if already completed..."
28
+ # Check if training already finished by looking for final output
29
+ if grep -q "Training completed" "${TRAIN_LOG}" 2>/dev/null || \
30
+ grep -q "Saving model checkpoint" "${TRAIN_LOG}" 2>/dev/null; then
31
+ echo "[INFO] Training appears to have already completed."
32
+ else
33
+ echo "[ERROR] No training process and no completion marker found. Exiting."
34
+ exit 1
35
+ fi
36
+ else
37
+ echo "[INFO] Watching training PID ${TRAIN_PID}..."
38
+ echo ""
39
+
40
+ # Poll every 60 seconds
41
+ while kill -0 "${TRAIN_PID}" 2>/dev/null; do
42
+ # Get current step
43
+ CURRENT_STEP=$(grep -oP '\d+/9840' "${TRAIN_LOG}" 2>/dev/null | tail -1 || echo "?/?")
44
+ LATEST_LOSS=$(grep "'loss':" "${TRAIN_LOG}" 2>/dev/null | tail -1 | grep -oP "'loss': '([^']+)'" | sed "s/'loss': '//;s/'//" || echo "?")
45
+ echo "[$(date '+%H:%M:%S')] Step ${CURRENT_STEP} | Loss: ${LATEST_LOSS} | PID ${TRAIN_PID} running"
46
+ sleep 60
47
+ done
48
+
49
+ echo ""
50
+ echo "=============================================="
51
+ echo "[INFO] Training process ${TRAIN_PID} has ended."
52
+ echo "[INFO] Time: $(date '+%Y-%m-%d %H:%M:%S')"
53
+ echo "=============================================="
54
+ fi
55
+
56
+ # Wait a moment for any final I/O
57
+ sleep 10
58
+
59
+ # Get final training stats
60
+ echo ""
61
+ echo "[INFO] Final training stats:"
62
+ grep "eval_loss" "${TRAIN_LOG}" | tail -1 | tr ',' '\n' | head -10
63
+ echo ""
64
+
65
+ # Detect the latest checkpoint
66
+ LATEST_CKPT=$(ls -d ${PROJECT_ROOT}/checkpoints/korean_3b_orpo_v1/checkpoint-* 2>/dev/null | sort -t- -k2 -n | tail -1 || true)
67
+ echo "[INFO] Latest checkpoint: ${LATEST_CKPT}"
68
+
69
+ if [ -z "${LATEST_CKPT}" ]; then
70
+ echo "[ERROR] No checkpoint found. Cannot proceed with evaluation."
71
+ exit 1
72
+ fi
73
+
74
+ # Send telegram notification (if available)
75
+ python3 -c "
76
+ import os, urllib.request, urllib.parse, json
77
+ token = os.environ.get('TELEGRAM_BOT_TOKEN', '')
78
+ chat_id = os.environ.get('TELEGRAM_CHAT_ID', '')
79
+ if token and chat_id:
80
+ msg = '🏁 ORPO 학습 완료! 자동 평가 시작합니다.\nCheckpoint: ${LATEST_CKPT##*/}'
81
+ url = f'https://api.telegram.org/bot{token}/sendMessage'
82
+ data = urllib.parse.urlencode({'chat_id': chat_id, 'text': msg}).encode()
83
+ urllib.request.urlopen(url, data, timeout=10)
84
+ print('[INFO] Telegram notification sent.')
85
+ else:
86
+ print('[INFO] Telegram not configured, skipping notification.')
87
+ " 2>/dev/null || true
88
+
89
+ # ============================================================================
90
+ # Launch evaluation pipeline
91
+ # ============================================================================
92
+ echo ""
93
+ echo "=============================================="
94
+ echo " Starting ORPO Evaluation Pipeline"
95
+ echo " Time: $(date '+%Y-%m-%d %H:%M:%S')"
96
+ echo "=============================================="
97
+
98
+ cd "${PROJECT_ROOT}"
99
+
100
+ python3 eval/orpo_eval_pipeline.py \
101
+ --checkpoint "${LATEST_CKPT}" \
102
+ 2>&1 | tee -a checkpoints/korean_3b_orpo_v1/eval.log || EVAL_EXIT=$?
103
+
104
+ EVAL_EXIT=${EVAL_EXIT:-0} # capture the eval pipeline status without letting set -e abort before the notification
105
+
106
+ echo ""
107
+ echo "=============================================="
108
+ echo " Evaluation Complete"
109
+ echo " Exit code: ${EVAL_EXIT}"
110
+ echo " Time: $(date '+%Y-%m-%d %H:%M:%S')"
111
+ echo "=============================================="
112
+
113
+ # Send completion notification
114
+ python3 -c "
115
+ import os, urllib.request, urllib.parse
116
+ token = os.environ.get('TELEGRAM_BOT_TOKEN', '')
117
+ chat_id = os.environ.get('TELEGRAM_CHAT_ID', '')
118
+ if token and chat_id:
119
+ exit_code = ${EVAL_EXIT}
120
+ status = '✅ 성공' if exit_code == 0 else '❌ 실패'
121
+ msg = f'ORPO 평가 완료: {status}\nExit code: {exit_code}\n보고서: reports/ 확인'
122
+ url = f'https://api.telegram.org/bot{token}/sendMessage'
123
+ data = urllib.parse.urlencode({'chat_id': chat_id, 'text': msg}).encode()
124
+ urllib.request.urlopen(url, data, timeout=10)
125
+ " 2>/dev/null || true
126
+
127
+ exit ${EVAL_EXIT}
source/scripts/orpo_hp_sweep.sh ADDED
@@ -0,0 +1,166 @@
1
+ #!/usr/bin/env bash
2
+ # =============================================================================
3
+ # orpo_hp_sweep.sh — ORPO Hyperparameter Sweep (200 steps each)
4
+ #
5
+ # 각 설정을 200 steps씩 돌려서 최적 조합을 찾는 스크립트.
6
+ # 결과는 sweep_results/ 디렉토리에 저장됨.
7
+ #
8
+ # Usage:
9
+ # bash scripts/orpo_hp_sweep.sh # 전체 sweep (6 runs)
10
+ # bash scripts/orpo_hp_sweep.sh --dry-run # 설정만 출력
11
+ # =============================================================================
12
+ set -uo pipefail
13
+ # NOTE: errexit (-e) is intentionally not set; individual runs may fail and the sweep logs them and continues
14
+
15
+ cd "$(dirname "$0")/.."
16
+
17
+ SWEEP_STEPS=200
18
+ SWEEP_DIR="checkpoints/orpo_sweep"
19
+ RESULTS_FILE="${SWEEP_DIR}/sweep_results.jsonl"
20
+ BASE_MODEL="eval/outputs/hf_3b_sft_best"
21
+ DATA_PATH="data/preference/combined_preference.jsonl"
22
+ NPROC=8
23
+ MASTER_PORT_BASE=29510
24
+
25
+ # B200 NCCL tuning (NVSwitch mesh — let NCCL auto-detect proto/channels/algo)
26
+ export NCCL_IB_DISABLE=1
27
+ export NCCL_BUFFSIZE=134217728
28
+ export OMP_NUM_THREADS=9
29
+ export MKL_NUM_THREADS=9
30
+ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
31
+ export NCCL_P2P_LEVEL=NVL
32
+ export PYTHONWARNINGS="ignore::UserWarning:torch.library"
33
+
34
+ mkdir -p "${SWEEP_DIR}"
35
+ declare -a FAILED_RUNS=()
36
+
37
+ # ---------------------------------------------------------------------------
38
+ # Sweep configurations: (name, beta, lr, max_length, batch_size, grad_accum)
39
+ # ---------------------------------------------------------------------------
40
+ # 핵심 탐색 축:
41
+ # 1. beta: 반복 억제 강도 (0.15 vs 0.25 vs 0.35)
42
+ # 2. lr: 수렴 속도 (5e-6 vs 8e-6 vs 1.2e-5)
43
+ # 3. max_length: VRAM vs 커버리지 (1024 vs 1536)
44
+
45
+ declare -a CONFIGS=(
46
+ # name beta lr max_len bs accum
47
+ "baseline_b015_lr8e6 0.15 8e-6 1536 4 4"
48
+ "baseline_b025_lr8e6 0.25 8e-6 1536 4 4"
49
+ "strong_b035_lr8e6 0.35 8e-6 1536 4 4"
50
+ "fast_b025_lr12e6 0.25 1.2e-5 1536 4 4"
51
+ "conserv_b025_lr5e6 0.25 5e-6 1536 4 4"
52
+ "short_b025_lr8e6 0.25 8e-6 1024 4 4"
53
+ )
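+ # Each row is word-split into NAME BETA LR MAX_LEN BS ACCUM by the read -r in the loop below.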
54
+
55
+ DRY_RUN=false
56
+ if [[ "${1:-}" == "--dry-run" ]]; then
57
+ DRY_RUN=true
58
+ fi
59
+
60
+ echo "=================================================================="
61
+ echo " ORPO Hyperparameter Sweep"
62
+ echo " Configs: ${#CONFIGS[@]}"
63
+ echo " Steps each: ${SWEEP_STEPS}"
64
+ echo " Results: ${RESULTS_FILE}"
65
+ echo "=================================================================="
66
+
67
+ for i in "${!CONFIGS[@]}"; do
68
+ read -r NAME BETA LR MAX_LEN BS ACCUM <<< "${CONFIGS[$i]}"
69
+ PORT=$((MASTER_PORT_BASE + i))
70
+ OUTPUT="${SWEEP_DIR}/${NAME}"
71
+
72
+ echo ""
73
+ echo "--- Run $((i+1))/${#CONFIGS[@]}: ${NAME} ---"
74
+ echo " beta=${BETA} lr=${LR} max_length=${MAX_LEN} bs=${BS} accum=${ACCUM}"
75
+
76
+ if [[ "${DRY_RUN}" == "true" ]]; then
77
+ echo " [DRY RUN] skipping"
78
+ continue
79
+ fi
80
+
81
+ mkdir -p "${OUTPUT}"
82
+ START_TIME=$(date +%s)
83
+
84
+ torchrun \
85
+ --nproc_per_node=${NPROC} \
86
+ --master_port=${PORT} \
87
+ train/orpo.py \
88
+ --model_path "${BASE_MODEL}" \
89
+ --custom_data_path "${DATA_PATH}" \
90
+ --output_dir "${OUTPUT}" \
91
+ --max_steps ${SWEEP_STEPS} \
92
+ --lr ${LR} \
93
+ --beta ${BETA} \
94
+ --batch_size ${BS} \
95
+ --gradient_accumulation_steps ${ACCUM} \
96
+ --max_length ${MAX_LEN} \
97
98
+ --weight_decay 0.01 \
99
+ --warmup_ratio 0.05 \
100
+ --eval_split_ratio 0.05 \
101
+ --eval_steps 100 \
102
+ --early_stopping_patience 100 \
103
+ --save_steps 200 \
104
+ --save_total_limit 1 \
105
+ --logging_steps 10 \
106
+ --report_to none \
107
+ --dataset_num_proc 64 \
108
+ --dataloader_num_workers 4 \
109
+ --no_load_best \
110
+ 2>&1 | tee "${OUTPUT}/train.log"
111
+ RUN_EXIT=$?
112
+
113
+ END_TIME=$(date +%s)
114
+ ELAPSED=$((END_TIME - START_TIME))
115
+
116
+ if [[ ${RUN_EXIT} -ne 0 ]]; then
117
+ echo " [ERROR] Run ${NAME} failed with exit code ${RUN_EXIT} after ${ELAPSED}s"
118
+ echo "{\"name\":\"${NAME}\",\"beta\":${BETA},\"lr\":\"${LR}\",\"max_length\":${MAX_LEN},\"status\":\"FAILED\",\"exit_code\":${RUN_EXIT},\"elapsed_s\":${ELAPSED}}" >> "${RESULTS_FILE}"
119
+ FAILED_RUNS+=("${NAME}")
120
+ continue
121
+ fi
122
+
123
+ # Extract final metrics from log
124
+ FINAL_LOSS=$(grep -oP "'loss': '[\d.]+'" "${OUTPUT}/train.log" | tail -1 | grep -oP "[\d.]+" || echo "N/A")
125
+ EVAL_LOSS=$(grep -oP "'eval_loss': '[\d.]+'" "${OUTPUT}/train.log" | tail -1 | grep -oP "[\d.]+" || echo "N/A")
126
+ MARGIN=$(grep -oP "'rewards/margins': '[-\d.]+'" "${OUTPUT}/train.log" | tail -1 | grep -oP "[-\d.]+" || echo "N/A")
127
+
128
+ # Save result
129
+ echo "{\"name\":\"${NAME}\",\"beta\":${BETA},\"lr\":\"${LR}\",\"max_length\":${MAX_LEN},\"status\":\"OK\",\"loss\":\"${FINAL_LOSS}\",\"eval_loss\":\"${EVAL_LOSS}\",\"margin\":\"${MARGIN}\",\"elapsed_s\":${ELAPSED}}" >> "${RESULTS_FILE}"
130
+
131
+ echo " -> loss=${FINAL_LOSS} eval_loss=${EVAL_LOSS} margin=${MARGIN} time=${ELAPSED}s"
132
+
133
+ # Cleanup weights to save disk (keep logs)
134
+ rm -rf "${OUTPUT}/checkpoint-"* "${OUTPUT}/emergency_checkpoint" 2>/dev/null || true
135
+ done
136
+
137
+ echo ""
138
+ echo "=================================================================="
139
+ echo " Sweep Complete!"
140
+ echo " Results: ${RESULTS_FILE}"
141
+ if [[ -f "${RESULTS_FILE}" ]]; then
142
+ echo ""
143
+ echo " Summary:"
144
+ cat "${RESULTS_FILE}" | python3 -c "
145
+ import sys, json
146
+ results = [json.loads(l) for l in sys.stdin]
147
+ results.sort(key=lambda r: float(r['eval_loss']) if str(r.get('eval_loss', 'N/A')) not in ('N/A', 'None', '') else float('inf'))  # failed / N/A runs sort last
148
+ print(f' {\"Name\":<25} {\"Beta\":>6} {\"LR\":>10} {\"Loss\":>8} {\"EvalLoss\":>10} {\"Margin\":>8} {\"Time\":>6}')
149
+ print(f' {\"-\"*25} {\"-\"*6} {\"-\"*10} {\"-\"*8} {\"-\"*10} {\"-\"*8} {\"-\"*6}')
150
+ for r in results:
151
+ print(f' {r[\"name\"]:<25} {r[\"beta\"]:>6} {r[\"lr\"]:>10} {r.get(\"loss\", \"-\"):>8} {r.get(\"eval_loss\", \"-\"):>10} {r.get(\"margin\", \"-\"):>8} {r[\"elapsed_s\"]:>5}s')
+ print()
+ best = results[0]
+ print(f' BEST: {best[\"name\"]} (eval_loss={best.get(\"eval_loss\", \"N/A\")})')
155
+ " 2>/dev/null || cat "${RESULTS_FILE}"
156
+ fi
157
+
158
+ # Report failed runs
159
+ if [[ ${#FAILED_RUNS[@]} -gt 0 ]]; then
160
+ echo ""
161
+ echo " FAILED RUNS (${#FAILED_RUNS[@]}):"
162
+ for fname in "${FAILED_RUNS[@]}"; do
163
+ echo " - ${fname}"
164
+ done
165
+ fi
166
+ echo "=================================================================="
source/scripts/prepare_3b_data.sh ADDED
@@ -0,0 +1,414 @@
1
+ #!/usr/bin/env bash
2
+ # =============================================================================
3
+ # prepare_3b_data.sh — 3B 모델 학습 데이터 전체 파이프라인
4
+ #
5
+ # 사용법:
6
+ # bash scripts/prepare_3b_data.sh [--step N] [--jobs 72]
7
+ #
8
+ # 스텝:
9
+ # 1 = CulturaX 토큰화
10
+ # 2 = cc100 해제 + 토큰화
11
+ # 3 = OSCAR 토큰화
12
+ # 4 = korean_webtext 토큰화
13
+ # 5 = HPLT 한국어 추출 + 토큰화
14
+ # 6 = textbooks + finepdfs + kovast 토큰화
15
+ # 7 = 전체 병합
16
+ # 8 = train/val split 검증
17
+ # =============================================================================
18
+ set -euo pipefail
19
+
20
+ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
21
+ PROJECT_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
22
+ cd "${PROJECT_ROOT}"
23
+
24
+ # ─── 설정 ────────────────────────────────────────────────────────────────
25
+ DATA_DIR="data"
26
+ EXTRA_DIR="data/korean_extra"
27
+ TOKENIZER="tokenizer/tokenizer.json"
28
+ VAL_SPLIT=0.002
29
+ SEED=42
30
+ JOBS=72
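+ # NOTE: --jobs/JOBS is parsed below but not yet used; the tokenization steps currently run single-process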
31
+ FROM_STEP=0
32
+ LOG_FILE="data/prepare_3b.log"
33
+
34
+ while [[ $# -gt 0 ]]; do
35
+ case $1 in
36
+ --step) FROM_STEP="$2"; shift 2 ;;
37
+ --jobs) JOBS="$2"; shift 2 ;;
38
+ *) echo "Unknown arg: $1"; exit 1 ;;
39
+ esac
40
+ done
41
+
42
+ mkdir -p "$(dirname "$LOG_FILE")"
43
+ exec > >(tee -a "$LOG_FILE") 2>&1
44
+
45
+ log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*"; }
46
+
47
+ # ─── 토큰화 헬퍼 (parquet → bin) ─────────────────────────────────────────
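+ # Usage: tokenize_parquet <name> <parquet_glob> <text_column>
+ #   e.g. tokenize_parquet "culturax" "${EXTRA_DIR}/culturax_ko/ko/*.parquet" "text"  (as in Step 1 below)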
48
+ tokenize_parquet() {
49
+ local name="$1"
50
+ local input_pattern="$2"
51
+ local text_col="$3"
52
+ local output="${DATA_DIR}/${name}_train.bin"
53
+
54
+ if [[ -f "$output" && $FROM_STEP -le 0 ]]; then
55
+ log "[SKIP] $output already exists ($(du -h "$output" | cut -f1))"
56
+ return
57
+ fi
58
+
59
+ log "[START] Tokenizing $name from parquet..."
60
+ python3 - <<PYEOF
61
+ import glob, os, sys
62
+ import numpy as np
63
+ from tokenizers import Tokenizer
64
+ import pyarrow.parquet as pq
65
+ from tqdm import tqdm
66
+ from concurrent.futures import ProcessPoolExecutor
67
+ import multiprocessing as mp
68
+
69
+ tokenizer_path = "${TOKENIZER}"
70
+ input_pattern = "${input_pattern}"
71
+ text_col = "${text_col}"
72
+ output_train = "${output}"
73
+ output_val = output_train.replace("_train.bin", "_val.bin")
74
+ val_split = ${VAL_SPLIT}
75
+ seed = ${SEED}
76
+
77
+ files = sorted(glob.glob(input_pattern))
78
+ print(f"Found {len(files)} parquet files")
79
+
80
+ tokenizer = Tokenizer.from_file(tokenizer_path)
81
+
82
+ all_tokens = []
83
+ total_docs = 0
84
+
85
+ for f in tqdm(files, desc="${name}"):
86
+ try:
87
+ table = pq.read_table(f, columns=[text_col])
88
+ for text in table.column(text_col):
89
+ t = text.as_py()
90
+ if t and len(t) > 50:
91
+ ids = tokenizer.encode(t).ids
92
+ all_tokens.extend(ids)
93
+ total_docs += 1
94
+ except Exception as e:
95
+ print(f"Error processing {f}: {e}", file=sys.stderr)
96
+ continue
97
+
98
+ print(f"Total: {total_docs:,} docs, {len(all_tokens):,} tokens")
99
+
100
+ # Split
101
+ # NOTE: no token-level shuffle here: shuffling individual token IDs would destroy document order
+ # and make the resulting .bin useless for LM training/eval. Split sequentially instead,
+ # consistent with the other steps in this script.
104
+ n_val = int(len(all_tokens) * val_split)
105
+ val_tokens = all_tokens[:n_val]
106
+ train_tokens = all_tokens[n_val:]
107
+
108
+ np.array(train_tokens, dtype=np.uint16).tofile(output_train)
109
+ np.array(val_tokens, dtype=np.uint16).tofile(output_val)
110
+ print(f"Saved: {output_train} ({len(train_tokens):,} tokens)")
111
+ print(f"Saved: {output_val} ({len(val_tokens):,} tokens)")
112
+ PYEOF
113
+ log "[DONE] $name → $output"
114
+ }
115
+
116
+ # ─── Step 1: CulturaX ────────────────────────────────────────────────────
117
+ if [[ $FROM_STEP -le 1 ]]; then
118
+ log "=== Step 1: CulturaX 토큰화 ==="
119
+ tokenize_parquet "culturax" \
120
+ "${EXTRA_DIR}/culturax_ko/ko/*.parquet" \
121
+ "text"
122
+ fi
123
+
124
+ # ─── Step 2: cc100 해제 + 토큰화 ─────────────────────────────────────────
125
+ if [[ $FROM_STEP -le 2 ]]; then
126
+ log "=== Step 2: cc100 해제 + 토큰화 ==="
127
+ CC100_XZ="${EXTRA_DIR}/cc100_ko/ko.txt.xz"
128
+ CC100_TXT="${EXTRA_DIR}/cc100_ko/ko.txt"
129
+ CC100_OUT="${DATA_DIR}/cc100_train.bin"
130
+
131
+ if [[ -f "$CC100_OUT" && $FROM_STEP -le 0 ]]; then
132
+ log "[SKIP] cc100 already tokenized"
133
+ else
134
+ # 해제
135
+ if [[ ! -f "$CC100_TXT" ]]; then
136
+ log "Decompressing cc100 xz (14GB → 54GB)..."
137
+ xz -dk "$CC100_XZ"
138
+ log "Decompression done"
139
+ fi
140
+
141
+ # 토큰화 (대용량 → 스트리밍)
142
+ log "Tokenizing cc100 (54GB text)..."
143
+ python3 - <<'PYEOF'
144
+ import numpy as np
145
+ from tokenizers import Tokenizer
146
+ from tqdm import tqdm
147
+ import random
148
+
149
+ tokenizer = Tokenizer.from_file("tokenizer/tokenizer.json")
150
+ input_file = "data/korean_extra/cc100_ko/ko.txt"
151
+ output_train = "data/cc100_train.bin"
152
+ output_val = "data/cc100_val.bin"
153
+
154
+ # Stream tokenize in chunks
155
+ all_tokens = []
156
+ doc_buffer = []
157
+ doc_count = 0
158
+
159
+ with open(input_file, 'r', encoding='utf-8', errors='replace') as f:
160
+ for line in tqdm(f, desc="cc100", unit=" lines"):
161
+ line = line.strip()
162
+ if not line:
163
+ # Document boundary
164
+ if doc_buffer:
165
+ text = '\n'.join(doc_buffer)
166
+ if len(text) > 50:
167
+ ids = tokenizer.encode(text).ids
168
+ all_tokens.extend(ids)
169
+ doc_count += 1
170
+ doc_buffer = []
171
+ else:
172
+ doc_buffer.append(line)
173
+
174
+ # Last doc
175
+ if doc_buffer:
176
+ text = '\n'.join(doc_buffer)
177
+ if len(text) > 50:
178
+ all_tokens.extend(tokenizer.encode(text).ids)
179
+ doc_count += 1
180
+
181
+ print(f"Total: {doc_count:,} docs, {len(all_tokens):,} tokens")
182
+
183
+ # Split
184
+ n_val = int(len(all_tokens) * 0.002)
185
+ np.array(all_tokens[n_val:], dtype=np.uint16).tofile(output_train)
186
+ np.array(all_tokens[:n_val], dtype=np.uint16).tofile(output_val)
187
+ print(f"Saved train: {len(all_tokens)-n_val:,} tokens")
188
+ print(f"Saved val: {n_val:,} tokens")
189
+ PYEOF
190
+ log "[DONE] cc100"
191
+ fi
192
+ fi
193
+
194
+ # ─── Step 3: OSCAR ───────────────────────────────────────────────────────
195
+ if [[ $FROM_STEP -le 3 ]]; then
196
+ log "=== Step 3: OSCAR 토큰화 ==="
197
+ OSCAR_OUT="${DATA_DIR}/oscar_train.bin"
198
+
199
+ if [[ -f "$OSCAR_OUT" && $FROM_STEP -le 0 ]]; then
200
+ log "[SKIP] OSCAR already tokenized"
201
+ else
202
+ python3 - <<'PYEOF'
203
+ import glob, numpy as np
204
+ from tokenizers import Tokenizer
205
+ import pyarrow.parquet as pq
206
+ from tqdm import tqdm
207
+
208
+ tokenizer = Tokenizer.from_file("tokenizer/tokenizer.json")
209
+ files = sorted(glob.glob("data/korean_extra/oscar_ko/data/kor_Hang/*.parquet"))
210
+ all_tokens = []
211
+ doc_count = 0
212
+
213
+ for f in tqdm(files, desc="OSCAR"):
214
+ table = pq.read_table(f, columns=['text'])
215
+ for row in table.column('text'):
216
+ if row is None:
217
+ continue
218
+ parts = row.as_py()
219
+ if parts:
220
+ text = '\n'.join(item['text'] for item in parts if item and item.get('text'))
221
+ if len(text) > 50:
222
+ all_tokens.extend(tokenizer.encode(text).ids)
223
+ doc_count += 1
224
+
225
+ print(f"OSCAR: {doc_count:,} docs, {len(all_tokens):,} tokens")
226
+ n_val = int(len(all_tokens) * 0.002)
227
+ np.array(all_tokens[n_val:], dtype=np.uint16).tofile("data/oscar_train.bin")
228
+ np.array(all_tokens[:n_val], dtype=np.uint16).tofile("data/oscar_val.bin")
229
+ PYEOF
230
+ log "[DONE] OSCAR"
231
+ fi
232
+ fi
233
+
234
+ # ─── Step 4: korean_webtext ──────────────────────────────────────────────
235
+ if [[ $FROM_STEP -le 4 ]]; then
236
+ log "=== Step 4: korean_webtext 토큰화 ==="
237
+ tokenize_parquet "webtext" \
238
+ "${EXTRA_DIR}/korean_webtext/data/*.parquet" \
239
+ "text"
240
+ fi
241
+
242
+ # ─── Step 5: HPLT 한국어 추출 + 토큰화 ──────────────────────────────────
243
+ if [[ $FROM_STEP -le 5 ]]; then
244
+ log "=== Step 5: HPLT 한국어 추출 + 토큰화 ==="
245
+ HPLT_OUT="${DATA_DIR}/hplt_ko_train.bin"
246
+
247
+ if [[ -f "$HPLT_OUT" && $FROM_STEP -le 0 ]]; then
248
+ log "[SKIP] HPLT already tokenized"
249
+ else
250
+ python3 - <<'PYEOF'
251
+ import glob, numpy as np
252
+ from tokenizers import Tokenizer
253
+ import pyarrow.parquet as pq
254
+ from tqdm import tqdm
255
+
256
+ tokenizer = Tokenizer.from_file("tokenizer/tokenizer.json")
257
+ files = sorted(glob.glob("data/korean_extra/hplt_ko/en-ko/*.parquet"))
258
+ all_tokens = []
259
+ doc_count = 0
260
+
261
+ for f in tqdm(files, desc="HPLT"):
262
+ table = pq.read_table(f, columns=['tgt_doc'])
263
+ for row in table.column('tgt_doc'):
264
+ d = row.as_py()
265
+ if d and d.get('sentences'):
266
+ text = '\n'.join(s for s in d['sentences'] if s)
267
+ if len(text) > 50:
268
+ all_tokens.extend(tokenizer.encode(text).ids)
269
+ doc_count += 1
270
+
271
+ print(f"HPLT Korean: {doc_count:,} docs, {len(all_tokens):,} tokens")
272
+ n_val = int(len(all_tokens) * 0.002)
273
+ np.array(all_tokens[n_val:], dtype=np.uint16).tofile("data/hplt_ko_train.bin")
274
+ np.array(all_tokens[:n_val], dtype=np.uint16).tofile("data/hplt_ko_val.bin")
275
+ PYEOF
276
+ log "[DONE] HPLT"
277
+ fi
278
+ fi
279
+
280
+ # ─── Step 6: textbooks + finepdfs + kovast ───────────────────────────────
281
+ if [[ $FROM_STEP -le 6 ]]; then
282
+ log "=== Step 6: 기타 소스 토큰화 ==="
283
+ EXTRA_OUT="${DATA_DIR}/extra_misc_train.bin"
284
+
285
+ if [[ -f "$EXTRA_OUT" && $FROM_STEP -le 0 ]]; then
286
+ log "[SKIP] extra_misc already tokenized"
287
+ else
288
+ python3 - <<'PYEOF'
289
+ import glob, numpy as np, os
290
+ from tokenizers import Tokenizer
291
+ import pyarrow.parquet as pq
292
+ from tqdm import tqdm
293
+
294
+ tokenizer = Tokenizer.from_file("tokenizer/tokenizer.json")
295
+ all_tokens = []
296
+ doc_count = 0
297
+
298
+ # korean_textbooks (MMLU-style: look for text columns)
299
+ tb_files = glob.glob("data/korean_extra/korean_textbooks/**/*.parquet", recursive=True)
300
+ for f in tqdm(tb_files, desc="textbooks"):
301
+ try:
302
+ table = pq.read_table(f)
303
+ # Try common text columns
304
+ for col in ['question', 'text', 'input', 'instruction']:
305
+ if col in table.column_names:
306
+ for val in table.column(col):
307
+ t = val.as_py()
308
+ if t and len(t) > 20:
309
+ all_tokens.extend(tokenizer.encode(t).ids)
310
+ doc_count += 1
311
+ break
312
+ except Exception:
313
+ continue
314
+
315
+ # finepdfs
316
+ pdf_files = glob.glob("data/korean_extra/finepdfs_edu_ko/*.parquet")
317
+ for f in tqdm(pdf_files, desc="finepdfs"):
318
+ try:
319
+ table = pq.read_table(f)
320
+ for col in ['text', 'content']:
321
+ if col in table.column_names:
322
+ for val in table.column(col):
323
+ t = val.as_py()
324
+ if t and len(t) > 50:
325
+ all_tokens.extend(tokenizer.encode(t).ids)
326
+ doc_count += 1
327
+ break
328
+ except Exception:
329
+ continue
330
+
331
+ print(f"Extra: {doc_count:,} docs, {len(all_tokens):,} tokens")
332
+ n_val = int(len(all_tokens) * 0.002)
333
+ np.array(all_tokens[n_val:], dtype=np.uint16).tofile("data/extra_misc_train.bin")
334
+ np.array(all_tokens[:n_val], dtype=np.uint16).tofile("data/extra_misc_val.bin")
335
+ PYEOF
336
+ log "[DONE] extra_misc"
337
+ fi
338
+ fi
339
+
340
+ # ─── Step 7: 전체 병합 ──────────────────────────────────────────────────
341
+ if [[ $FROM_STEP -le 7 ]]; then
342
+ log "=== Step 7: 전체 병합 ==="
343
+
344
+ TRAIN_BINS=""
345
+ for f in \
346
+ "${DATA_DIR}/korean_train.bin" \
347
+ "${DATA_DIR}/culturax_train.bin" \
348
+ "${DATA_DIR}/cc100_train.bin" \
349
+ "${DATA_DIR}/oscar_train.bin" \
350
+ "${DATA_DIR}/webtext_train.bin" \
351
+ "${DATA_DIR}/hplt_ko_train.bin" \
352
+ "${DATA_DIR}/extra_misc_train.bin"; do
353
+ if [[ -f "$f" ]]; then
354
+ TRAIN_BINS="$TRAIN_BINS $f"
355
+ log " Including: $f ($(du -h "$f" | cut -f1))"
356
+ else
357
+ log " [WARN] Missing: $f"
358
+ fi
359
+ done
360
+
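+ # merge_bins.py (not shown in this diff) is assumed to take N input .bin paths followed by one
+ # output path and to concatenate the uint16 token streams in argument order.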
361
+ if [[ -n "$TRAIN_BINS" ]]; then
362
+ python3 data/merge_bins.py $TRAIN_BINS "${DATA_DIR}/merged_3b_train.bin"
363
+ log "[DONE] merged_3b_train.bin created"
364
+ fi
365
+
366
+ # Val 병합
367
+ VAL_BINS=""
368
+ for f in \
369
+ "${DATA_DIR}/korean_val.bin" \
370
+ "${DATA_DIR}/culturax_val.bin" \
371
+ "${DATA_DIR}/cc100_val.bin" \
372
+ "${DATA_DIR}/oscar_val.bin" \
373
+ "${DATA_DIR}/webtext_val.bin" \
374
+ "${DATA_DIR}/hplt_ko_val.bin" \
375
+ "${DATA_DIR}/extra_misc_val.bin"; do
376
+ if [[ -f "$f" ]]; then
377
+ VAL_BINS="$VAL_BINS $f"
378
+ fi
379
+ done
380
+
381
+ if [[ -n "$VAL_BINS" ]]; then
382
+ python3 data/merge_bins.py $VAL_BINS "${DATA_DIR}/merged_3b_val.bin"
383
+ log "[DONE] merged_3b_val.bin created"
384
+ fi
385
+ fi
386
+
387
+ # ─── Step 8: 검증 ────────────────────────────────────────────────────────
388
+ if [[ $FROM_STEP -le 8 ]]; then
389
+ log "=== Step 8: 최종 검증 ==="
390
+ python3 - <<'PYEOF'
391
+ import os, glob
392
+ import numpy as np
393
+
394
+ print("=== 토큰화 결과 ===")
395
+ total_train = 0
396
+ total_val = 0
397
+ for f in sorted(glob.glob("data/*_train.bin") + glob.glob("data/train.bin")):
+ if "merged_3b" in f:  # skip the merged output so sources are not double-counted
+ continue
398
+ n = os.path.getsize(f) // 2
399
+ total_train += n
400
+ print(f" {os.path.basename(f):30s}: {n:>15,} tokens ({os.path.getsize(f)/1e9:.2f} GB)")
401
+
402
+ for f in sorted(glob.glob("data/*_val.bin") + glob.glob("data/val.bin")):
+ if "merged_3b" in f:
+ continue
403
+ n = os.path.getsize(f) // 2
404
+ total_val += n
405
+
406
+ print(f"\n Total train: {total_train:,} tokens ({total_train/1e9:.1f}B)")
407
+ print(f" Total val: {total_val:,} tokens ({total_val/1e6:.1f}M)")
408
+ print(f"\n 3B Chinchilla minimum: 60B tokens")
409
+ print(f" Epochs needed for 60B: {60e9/total_train:.1f}")
410
+ print(f" Epochs needed for 100B: {100e9/total_train:.1f}")
411
+ PYEOF
412
+ fi
413
+
414
+ log "=== 파이프라인 완료 ==="
source/scripts/prepare_sft_combined.sh ADDED
@@ -0,0 +1,264 @@
1
+ #!/usr/bin/env bash
2
+ # prepare_sft_combined.sh — 3B SFT용 전체 데이터 통합
3
+ # 모든 SFT 데이터를 하나의 train/val 파일로 합침
4
+ #
5
+ # 업데이트 (2026-03-02): sft_extra 신규 소스 추가
6
+ # - nayohan_Evol-Instruct-Code-80k-v1-ko (코드 instruction)
7
+ # - FreedomIntelligence_alpaca-gpt4-korean (GPT-4 alpaca 한국어)
8
+ # - FreedomIntelligence_evol-instruct-korean (evol-instruct 한국어)
9
+ # - coastral_korean-writing-style-instruct (한국어 글쓰기 스타일)
10
+ # - maywell_ko_wikidata_QA (위키데이터 QA)
11
+ # - OpenAssistant_oasst1_ko (OASST1 한국어, 트리 재구성)
12
+ # - Bllossom_evol-instruct-ko (존재 확인 후 로드)
13
+ set -euo pipefail
14
+ BASE="$(cd "$(dirname "$0")/.." && pwd)"
15
+ OUT_DIR="$BASE/data/sft_combined"
16
+ mkdir -p "$OUT_DIR"
17
+
18
+ python3 << 'PYEOF'
19
+ import json, random, os, glob, hashlib
20
+ from collections import defaultdict
21
+
22
+ BASE = "/PROJECT/0325120031_A/ghong/taketimes/llm-bang/data"
23
+ OUT_TRAIN = f"{BASE}/sft_combined/train.jsonl"
24
+ OUT_VAL = f"{BASE}/sft_combined/val.jsonl"
25
+ VAL_RATIO = 0.02
26
+ SEED = 42
27
+
28
+ # SFT 소스 파일 목록 (chat 포맷으로 변환 가능한 것들)
29
+ SOURCES = [
30
+ # (path, fmt) fmt: "messages" | "auto" | "oasst"
31
+ (f"{BASE}/sft/train.jsonl", "messages"),
32
+ (f"{BASE}/sft_extra/ultrachat_200k/train_sft.jsonl", "messages"),
33
+ (f"{BASE}/sft_extra/open_korean_instructions/train.jsonl", "messages"),
34
+ (f"{BASE}/sft_extra/korean_instruction_mix/train.jsonl", "messages"),
35
+ (f"{BASE}/sft_extra/openhermes_2.5/train.jsonl", "messages"),
36
+ (f"{BASE}/sft_extra/magpie_reasoning_v2/train.jsonl", "messages"),
37
+ (f"{BASE}/sft_extra/magpie_reasoning_ko/train.jsonl", "messages"),
38
+ (f"{BASE}/sft_extra/reasoning_r1_1.4m/train.jsonl", "messages"),
39
+ (f"{BASE}/sft_extra/lemon-mint_smol-koreantalk.jsonl", "auto"),
40
+ (f"{BASE}/sft_extra/dbdu_ShareGPT-74k-ko.jsonl", "auto"),
41
+ (f"{BASE}/sft_extra/ko_lima/data.jsonl", "auto"),
42
+ (f"{BASE}/sft_extra/koalpaca_v1_1a/data.jsonl", "auto"),
43
+ (f"{BASE}/sft_extra/kullm_v2/data.jsonl", "auto"),
44
+ (f"{BASE}/sft_extra/kuotient_orca-math-word-problems-193k-korean.jsonl", "auto"),
45
+ (f"{BASE}/sft_extra/kyujinpy_KOR-OpenOrca-Platypus-v3/data.jsonl", "auto"),
46
+ (f"{BASE}/sft_extra/nlp-with-deeplearning_Ko.WizardLM_evol_instruct_V2_196k.jsonl", "auto"),
47
+ (f"{BASE}/sft_extra/AI-MO_NuminaMath-CoT/data.jsonl", "auto"),
48
+ (f"{BASE}/sft_extra/zwhe99_DeepMath-103K/data.jsonl", "auto"),
49
+ # ---- 신규 소스 (2026-03-02) ----
50
+ (f"{BASE}/sft_extra/nayohan_Evol-Instruct-Code-80k-v1-ko/data.jsonl", "auto"),
51
+ (f"{BASE}/sft_extra/FreedomIntelligence_alpaca-gpt4-korean.jsonl", "auto"),
52
+ (f"{BASE}/sft_extra/FreedomIntelligence_evol-instruct-korean.jsonl", "auto"),
53
+ (f"{BASE}/sft_extra/coastral_korean-writing-style-instruct.jsonl", "auto"),
54
+ (f"{BASE}/sft_extra/maywell_ko_wikidata_QA.jsonl", "auto"),
55
+ (f"{BASE}/sft_extra/OpenAssistant_oasst1_ko.jsonl", "oasst"),
56
+ (f"{BASE}/sft_extra/Bllossom_evol-instruct-ko/data.jsonl", "auto"),
57
+ ]
58
+
59
+ def to_messages(obj):
60
+ """다양한 포맷을 통일된 messages 포맷으로 변환"""
61
+ # 이미 messages 포맷
62
+ if 'messages' in obj and isinstance(obj['messages'], list):
63
+ return obj['messages']
64
+ # conversations 포맷
65
+ if 'conversations' in obj:
66
+ msgs = []
67
+ for turn in obj['conversations']:
68
+ role = turn.get('from', turn.get('role', ''))
69
+ content = turn.get('value', turn.get('content', ''))
70
+ if role in ('human', 'user', 'prompter'):
71
+ msgs.append({'role': 'user', 'content': content})
72
+ elif role in ('gpt', 'assistant', 'bot'):
73
+ msgs.append({'role': 'assistant', 'content': content})
74
+ return msgs if len(msgs) >= 2 else None
75
+ # instruction/output 포맷
76
+ if 'instruction' in obj:
77
+ instruction = obj['instruction']
78
+ inp = obj.get('input', '')
79
+ output = obj.get('output', obj.get('response', ''))
80
+ if not output: return None
81
+ user_content = instruction + ('\n\n' + inp if inp else '')
82
+ return [{'role': 'user', 'content': user_content}, {'role': 'assistant', 'content': output}]
83
+ # question/answer 포맷
84
+ if 'question' in obj and 'answer' in obj:
85
+ return [{'role': 'user', 'content': obj['question']}, {'role': 'assistant', 'content': obj['answer']}]
86
+ # prompt/response
87
+ if 'prompt' in obj and ('response' in obj or 'completion' in obj):
88
+ resp = obj.get('response', obj.get('completion', ''))
89
+ return [{'role': 'user', 'content': obj['prompt']}, {'role': 'assistant', 'content': resp}]
90
+ # problem/solution
91
+ if 'problem' in obj and 'solution' in obj:
92
+ return [{'role': 'user', 'content': obj['problem']}, {'role': 'assistant', 'content': obj['solution']}]
93
+ return None
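+ # e.g. {"instruction": ..., "input": ..., "output": ...} maps to
+ #      [{"role": "user", "content": instruction + "\n\n" + input}, {"role": "assistant", "content": output}]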
94
+
95
+
96
+ def load_oasst(path):
97
+ """
98
+ OpenAssistant OASST1 flat message 포맷을 대화 트리로 재구성.
99
+ 각 루트(prompter) 메시지에서 best-ranked assistant 응답(rank=0.0)을
100
+ 따라 단일 대화 스레드를 추출한다.
101
+ deleted=True 메시지와 review_result=False 메시지는 제외.
102
+ """
103
+ nodes = {} # message_id → obj
104
+ children = defaultdict(list) # parent_id → [child_obj, ...]
105
+
106
+ with open(path, 'r', errors='replace') as f:
107
+ for line in f:
108
+ line = line.strip()
109
+ if not line:
110
+ continue
111
+ try:
112
+ obj = json.loads(line)
113
+ except Exception:
114
+ continue
115
+ if obj.get('deleted', False):
116
+ continue
117
+ if obj.get('review_result') is False:
118
+ continue
119
+ mid = obj.get('message_id')
120
+ if mid:
121
+ nodes[mid] = obj
122
+ pid = obj.get('parent_id')
123
+ if pid:
124
+ children[pid].append(obj)
125
+
126
+ # 자식 목록을 rank 오름차순 정렬 (rank=null은 뒤로)
127
+ def sort_key(c):
128
+ r = c.get('rank')
129
+ mid = c.get('message_id', '')
130
+ return (1, 0, mid) if r is None else (0, r, mid)
131
+ for pid in children:
132
+ children[pid].sort(key=sort_key)
133
+
134
+ samples = []
135
+
136
+ def build_thread(node, current_msgs):
137
+ """재귀적으로 대화 스레드를 따라 samples에 추가."""
138
+ role = node.get('role', '')
139
+ text = node.get('text', '')
140
+ if role == 'prompter':
141
+ mapped_role = 'user'
142
+ elif role == 'assistant':
143
+ mapped_role = 'assistant'
144
+ else:
145
+ return
146
+
147
+ msgs = current_msgs + [{'role': mapped_role, 'content': text}]
148
+
149
+ # 유효한 user→assistant 쌍이 있을 때만 샘플 추가
150
+ if mapped_role == 'assistant' and len(msgs) >= 2:
151
+ samples.append({'messages': msgs})
152
+
153
+ # 자식 중 best (rank=0.0) 하나만 따라간다 (가장 품질 높은 경로)
154
+ kids = children.get(node.get('message_id'), [])
155
+ if kids:
156
+ build_thread(kids[0], msgs)
157
+
158
+ # 루트 노드: parent_id가 없는 prompter 메시지
159
+ roots = [n for n in nodes.values() if n.get('parent_id') is None and n.get('role') == 'prompter']
160
+ for root in roots:
161
+ build_thread(root, [])
162
+
163
+ return samples
164
+
165
+
166
+ random.seed(SEED)
167
+ all_samples = []
168
+
169
+ for path, fmt in SOURCES:
170
+ if not os.path.exists(path):
171
+ print(f"[SKIP] {path}")
172
+ continue
173
+
174
+ if fmt == "oasst":
175
+ samples = load_oasst(path)
176
+ all_samples.extend(samples)
177
+ print(f"[LOADED] {os.path.basename(path)}: {len(samples):,} samples (oasst tree)")
178
+ continue
179
+
180
+ count = 0
181
+ with open(path, 'r', errors='replace') as f:
182
+ for line in f:
183
+ line = line.strip()
184
+ if not line: continue
185
+ try:
186
+ obj = json.loads(line)
187
+ except Exception:
188
+ continue
189
+ if fmt == "messages":
190
+ msgs = obj.get('messages') or obj.get('conversations')
191
+ if msgs:
192
+ all_samples.append({'messages': msgs})
193
+ count += 1
194
+ else: # auto detect
195
+ msgs = to_messages(obj)
196
+ if msgs and len(msgs) >= 2:
197
+ all_samples.append({'messages': msgs})
198
+ count += 1
199
+ print(f"[LOADED] {os.path.basename(path)}: {count:,} samples")
200
+ if count == 0:
201
+ print(f"[WARN] {os.path.basename(path)}: 0 samples extracted (format detection may have failed)")
202
+
203
+ print(f"\n총 샘플: {len(all_samples):,}")
204
+
205
+ # ---- Deduplication (MD5 of first user message) ----
206
+ seen_hashes = set()
207
+ unique_samples = []
208
+ dup_count = 0
209
+ for s in all_samples:
210
+ msgs = s.get('messages', [])
211
+ first_user = next((m['content'] for m in msgs if m.get('role') == 'user'), '')
212
+ h = hashlib.md5(first_user.encode('utf-8', errors='replace')).hexdigest()
213
+ if h in seen_hashes:
214
+ dup_count += 1
215
+ continue
216
+ seen_hashes.add(h)
217
+ unique_samples.append(s)
218
+
219
+ print(f"[DEDUP] 제거: {dup_count:,}, 남은 샘플: {len(unique_samples):,}")
220
+ all_samples = unique_samples
221
+
222
+ # ---- Format validation ----
223
+ def validate_messages(msgs):
224
+ """Check messages have valid role/content structure."""
225
+ if not isinstance(msgs, list) or len(msgs) < 2:
226
+ return False
227
+ for m in msgs:
228
+ if not isinstance(m, dict):
229
+ return False
230
+ if m.get('role') not in ('user', 'assistant', 'system'):
231
+ return False
232
+ if not isinstance(m.get('content'), str):
233
+ return False
234
+ return True
235
+
236
+ valid_samples = []
237
+ invalid_count = 0
238
+ for s in all_samples:
239
+ if validate_messages(s.get('messages', [])):
240
+ valid_samples.append(s)
241
+ else:
242
+ invalid_count += 1
243
+
244
+ print(f"[VALIDATE] 유효하지 않은 포맷 제거: {invalid_count:,}, 남은 샘플: {len(valid_samples):,}")
245
+ all_samples = valid_samples
246
+
247
+ random.shuffle(all_samples)
248
+
249
+ n_val = int(len(all_samples) * VAL_RATIO)
250
+ val_samples = all_samples[:n_val]
251
+ train_samples = all_samples[n_val:]
252
+
253
+ os.makedirs(os.path.dirname(OUT_TRAIN), exist_ok=True)
254
+ with open(OUT_TRAIN, 'w') as f:
255
+ for s in train_samples:
256
+ f.write(json.dumps(s, ensure_ascii=False) + '\n')
257
+ with open(OUT_VAL, 'w') as f:
258
+ for s in val_samples:
259
+ f.write(json.dumps(s, ensure_ascii=False) + '\n')
260
+
261
+ print(f"[DONE] train: {len(train_samples):,} → {OUT_TRAIN}")
262
+ print(f"[DONE] val: {len(val_samples):,} → {OUT_VAL}")
263
+ PYEOF
264
+ echo "SFT 데이터 병합 완료"
source/scripts/quality_gate.sh ADDED
@@ -0,0 +1,518 @@
1
+ #!/usr/bin/env bash
2
+ # =============================================================================
3
+ # quality_gate.sh — Phase 완료 자동 품질 게이트 검증
4
+ #
5
+ # Usage:
6
+ # bash scripts/quality_gate.sh <phase>
7
+ #
8
+ # Phases:
9
+ # pretrain — 사전학습 게이트 (val_loss, loss 단조 감소)
10
+ # sft — SFT 게이트 (val_loss 수렴, 반복률, KoBEST)
11
+ # orpo — ORPO 게이트 (반복률, KoBEST, chosen > rejected)
12
+ # deploy — 배포 게이트 (GGUF perplexity, Ollama 응답)
13
+ # all — 모든 게이트 순차 실행
14
+ #
15
+ # Exit codes:
16
+ # 0 — 게이트 통과
17
+ # 1 — 게이트 실패 (기준 미달)
18
+ # 2 — 필수 파일 / 의존성 없음 (실행 불가)
19
+ # =============================================================================
20
+ set -uo pipefail
21
+
22
+ PROJECT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
23
+
24
+ # ---------------------------------------------------------------------------
25
+ # 색상 출력 헬퍼
26
+ # ---------------------------------------------------------------------------
27
+ _RED='\033[0;31m'
28
+ _GREEN='\033[0;32m'
29
+ _YELLOW='\033[1;33m'
30
+ _BLUE='\033[0;34m'
31
+ _NC='\033[0m'
32
+
33
+ log_info() { echo -e "${_BLUE}[INFO]${_NC} $*"; }
34
+ log_ok() { echo -e "${_GREEN}[PASS]${_NC} $*"; }
35
+ log_warn() { echo -e "${_YELLOW}[WARN]${_NC} $*"; }
36
+ log_fail() { echo -e "${_RED}[FAIL]${_NC} $*"; }
37
+ log_skip() { echo -e " [SKIP] $*"; }
38
+
39
+ # ---------------------------------------------------------------------------
40
+ # 유틸리티: Python 한 줄 표현식 평가 (부동소수점 비교)
41
+ # ---------------------------------------------------------------------------
42
+ py_eval() {
43
+ python3 -c "import sys; sys.exit(0 if ($1) else 1)"
44
+ }
45
+
46
+ py_value() {
47
+ python3 -c "print($1)"
48
+ }
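+ # e.g. py_eval "2.3 < 2.5" exits 0 (true); py_value "0.62 * 100" prints 62.0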
49
+
50
+ # ---------------------------------------------------------------------------
51
+ # 유틸리티: JSON에서 값 추출
52
+ # ---------------------------------------------------------------------------
53
+ json_get() {
54
+ local file="$1" key="$2"
55
+ python3 -c "
56
+ import json, sys
57
+ try:
58
+ d = json.load(open('$file'))
59
+ keys = '$key'.split('.')
60
+ for k in keys:
61
+ d = d[k]
62
+ print(d)
63
+ except Exception as e:
64
+ print('NOT_FOUND')
65
+ sys.exit(1)
66
+ "
67
+ }
68
+
69
+ # ---------------------------------------------------------------------------
70
+ # 게이트 결과 집계
71
+ # ---------------------------------------------------------------------------
72
+ GATE_PASS=0
73
+ GATE_FAIL=0
74
+ GATE_SKIP=0
75
+
76
+ record_pass() { GATE_PASS=$((GATE_PASS + 1)); log_ok "$*"; }
77
+ record_fail() { GATE_FAIL=$((GATE_FAIL + 1)); log_fail "$*"; }
78
+ record_skip() { GATE_SKIP=$((GATE_SKIP + 1)); log_skip "$*"; }
79
+
80
+ # =============================================================================
81
+ # Gate 1: Pretrain
82
+ # =============================================================================
83
+ gate_pretrain() {
84
+ echo ""
85
+ echo "=================================================================="
86
+ echo " Gate: PRETRAIN"
87
+ echo " 기준: val_loss < 2.5 | loss 단조 감소 확인"
88
+ echo "=================================================================="
89
+
90
+ # 최신 체크포인트 디렉토리 탐색
91
+ CKPT_BASE="$PROJECT_DIR/checkpoints"
92
+ METRICS_FILE=""
93
+
94
+ # metrics.json 또는 train_log.jsonl 탐색
95
+ for candidate in \
96
+ "$CKPT_BASE/korean_3b_fp8_pretrain/metrics.json" \
97
+ "$CKPT_BASE/korean_3b_pretrain/metrics.json" \
98
+ "$PROJECT_DIR/outputs/pretrain_metrics.json" \
99
+ "$PROJECT_DIR/logs/pretrain_metrics.json"
100
+ do
101
+ if [[ -f "$candidate" ]]; then
102
+ METRICS_FILE="$candidate"
103
+ break
104
+ fi
105
+ done
106
+
107
+ if [[ -z "$METRICS_FILE" ]]; then
108
+ log_warn "사전학습 메트릭 파일을 찾을 수 없습니다."
109
+ log_warn "찾는 경로: $CKPT_BASE/korean_3b_*/metrics.json"
110
+ log_warn "메트릭 파일이 없으면 학습 스크립트에서 아래 형식으로 저장하세요:"
111
+ log_warn ' {"val_loss": 2.3, "loss_history": [3.1, 2.8, 2.5, 2.3]}'
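+ # A minimal way to write this file from a training script (sketch; the path is just one of the
+ # candidate locations searched above):
+ #   python3 -c 'import json; json.dump({"val_loss": 2.3, "loss_history": [3.1, 2.8, 2.5, 2.3]}, open("checkpoints/korean_3b_fp8_pretrain/metrics.json", "w"))'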
112
+ record_skip "메트릭 파일 없음 — 게이트 건너뜀"
113
+ return 0
114
+ fi
115
+
116
+ log_info "메트릭 파일: $METRICS_FILE"
117
+
118
+ # val_loss 확인
119
+ VAL_LOSS=$(json_get "$METRICS_FILE" "val_loss" 2>/dev/null || echo "NOT_FOUND")
120
+ if [[ "$VAL_LOSS" == "NOT_FOUND" ]]; then
121
+ record_skip "val_loss 키 없음 — 건너뜀"
122
+ else
123
+ log_info "val_loss = $VAL_LOSS (기준: < 2.5)"
124
+ if py_eval "$VAL_LOSS < 2.5" 2>/dev/null; then
125
+ record_pass "val_loss $VAL_LOSS < 2.5"
126
+ else
127
+ record_fail "val_loss $VAL_LOSS >= 2.5 (기준 미달)"
128
+ fi
129
+ fi
130
+
131
+ # loss 단조 감소 확인 (loss_history)
132
+ python3 - "$METRICS_FILE" <<'PYEOF'
133
+ import json, sys
134
+
135
+ metrics_file = sys.argv[1]
136
+ try:
137
+ d = json.load(open(metrics_file))
138
+ history = d.get("loss_history", [])
139
+ except Exception as e:
140
+ print(f"[SKIP] loss_history 읽기 실패: {e}")
141
+ sys.exit(2)
142
+
143
+ if len(history) < 2:
144
+ print(f"[SKIP] loss_history 데이터 부족 ({len(history)}개)")
+ sys.exit(2)
146
+
147
+ # 전체 추세가 감소하는지 확인 (처음 1/4 vs 마지막 1/4 평균 비교)
148
+ n = len(history)
149
+ q = max(1, n // 4)
150
+ early_avg = sum(history[:q]) / q
151
+ late_avg = sum(history[-q:]) / q
152
+
153
+ if late_avg < early_avg:
154
+ print(f"[PASS] loss 단조 감소 확인: 초기 avg={early_avg:.4f} → 최근 avg={late_avg:.4f}")
155
+ sys.exit(0)
156
+ else:
157
+ print(f"[FAIL] loss 감소 미확인: 초기 avg={early_avg:.4f}, 최근 avg={late_avg:.4f}")
158
+ sys.exit(1)
159
+ PYEOF
160
+ local mono_exit=$?
161
+ if [[ $mono_exit -eq 0 ]]; then
+ GATE_PASS=$((GATE_PASS + 1))
+ elif [[ $mono_exit -eq 1 ]]; then
+ GATE_FAIL=$((GATE_FAIL + 1))
+ else
+ GATE_SKIP=$((GATE_SKIP + 1))   # exit 2 = skipped (missing or unreadable metrics)
+ fi
167
+ }
168
+
169
+ # =============================================================================
170
+ # Gate 2: SFT
171
+ # =============================================================================
172
+ gate_sft() {
173
+ echo ""
174
+ echo "=================================================================="
175
+ echo " Gate: SFT"
176
+ echo " 기준: val_loss 수렴 | 반복률 < 15% | KoBEST > 55%"
177
+ echo "=================================================================="
178
+
179
+ METRICS_FILE=""
180
+ for candidate in \
181
+ "$PROJECT_DIR/outputs/sft_metrics.json" \
182
+ "$PROJECT_DIR/logs/sft_metrics.json" \
183
+ "$PROJECT_DIR/checkpoints/sft/metrics.json"
184
+ do
185
+ if [[ -f "$candidate" ]]; then
186
+ METRICS_FILE="$candidate"
187
+ break
188
+ fi
189
+ done
190
+
191
+ if [[ -z "$METRICS_FILE" ]]; then
192
+ log_warn "SFT 메트릭 파일을 찾을 수 없습니다."
193
+ log_warn ' {"val_loss": 1.8, "rep_rate": 0.08, "kobest_score": 0.62}'
194
+ record_skip "SFT 메트릭 파일 없음 — 게이트 건너뜀"
195
+ return 0
196
+ fi
197
+
198
+ log_info "메트릭 파일: $METRICS_FILE"
199
+
200
+ # val_loss 수렴 (상대 변화율 < 1% — 마지막 두 체크포인트)
201
+ python3 - "$METRICS_FILE" <<'PYEOF'
202
+ import json, sys
203
+
204
+ metrics_file = sys.argv[1]
205
+ try:
206
+ d = json.load(open(metrics_file))
207
+ history = d.get("val_loss_history", [])
208
+ except Exception as e:
209
+ print(f"[SKIP] val_loss_history 읽기 실패: {e}")
210
+ sys.exit(2)
211
+
212
+ if len(history) < 2:
213
+ # 단일 val_loss만 있으면 단순 확인
214
+ val_loss = d.get("val_loss")
215
+ if val_loss is not None:
216
+ print(f"[INFO] val_loss = {val_loss} (수렴 히스토리 없음 — 단일 값 확인 건너뜀)")
+ sys.exit(2)
+ # no history and no single val_loss: nothing to check
+ print("[SKIP] neither val_loss_history nor val_loss found")
+ sys.exit(2)
218
+
219
+ last = history[-1]
220
+ second = history[-2]
221
+ rel_change = abs(last - second) / max(abs(second), 1e-9)
222
+
223
+ if rel_change < 0.01:
224
+ print(f"[PASS] val_loss 수렴 (상대변화율 {rel_change*100:.3f}% < 1%): {second:.4f} → {last:.4f}")
225
+ sys.exit(0)
226
+ else:
227
+ print(f"[FAIL] val_loss 미수렴 (상대변화율 {rel_change*100:.3f}% >= 1%): {second:.4f} → {last:.4f}")
228
+ sys.exit(1)
229
+ PYEOF
230
+ local conv_exit=$?
231
+ if [[ $conv_exit -eq 0 ]]; then
+ GATE_PASS=$((GATE_PASS + 1))
+ elif [[ $conv_exit -eq 1 ]]; then
+ GATE_FAIL=$((GATE_FAIL + 1))
+ else
+ GATE_SKIP=$((GATE_SKIP + 1))   # exit 2 = skipped
+ fi
232
+
233
+ # 반복률 확인
234
+ REP_RATE=$(json_get "$METRICS_FILE" "rep_rate" 2>/dev/null || echo "NOT_FOUND")
235
+ if [[ "$REP_RATE" == "NOT_FOUND" ]]; then
236
+ record_skip "rep_rate 키 없음 — 건너뜀"
237
+ else
238
+ REP_PCT=$(py_value "$REP_RATE * 100")
239
+ log_info "반복률 = ${REP_PCT}% (기준: < 15%)"
240
+ if py_eval "$REP_RATE < 0.15" 2>/dev/null; then
241
+ record_pass "반복률 ${REP_PCT}% < 15%"
242
+ else
243
+ record_fail "반복률 ${REP_PCT}% >= 15% (기준 미달)"
244
+ fi
245
+ fi
246
+
247
+ # KoBEST 확인
248
+ KOBEST=$(json_get "$METRICS_FILE" "kobest_score" 2>/dev/null || echo "NOT_FOUND")
249
+ if [[ "$KOBEST" == "NOT_FOUND" ]]; then
250
+ record_skip "kobest_score 키 없음 — 건너뜀"
251
+ else
252
+ KOBEST_PCT=$(py_value "$KOBEST * 100")
253
+ log_info "KoBEST = ${KOBEST_PCT}% (기준: > 55%)"
254
+ if py_eval "$KOBEST > 0.55" 2>/dev/null; then
255
+ record_pass "KoBEST ${KOBEST_PCT}% > 55%"
256
+ else
257
+ record_fail "KoBEST ${KOBEST_PCT}% <= 55% (기준 미달)"
258
+ fi
259
+ fi
260
+ }
261
+
262
+ # =============================================================================
263
+ # Gate 3: ORPO
264
+ # =============================================================================
265
+ gate_orpo() {
266
+ echo ""
267
+ echo "=================================================================="
268
+ echo " Gate: ORPO"
269
+ echo " 기준: 반복률 < 5% | KoBEST > 60% | chosen > rejected 90%+"
270
+ echo "=================================================================="
271
+
272
+ METRICS_FILE=""
273
+ for candidate in \
274
+ "$PROJECT_DIR/outputs/orpo_metrics.json" \
275
+ "$PROJECT_DIR/logs/orpo_metrics.json" \
276
+ "$PROJECT_DIR/checkpoints/orpo/metrics.json"
277
+ do
278
+ if [[ -f "$candidate" ]]; then
279
+ METRICS_FILE="$candidate"
280
+ break
281
+ fi
282
+ done
283
+
284
+ if [[ -z "$METRICS_FILE" ]]; then
285
+ log_warn "ORPO 메트릭 파일을 찾을 수 없습니다."
286
+ log_warn ' {"rep_rate": 0.03, "kobest_score": 0.63, "chosen_win_rate": 0.92}'
287
+ record_skip "ORPO 메트릭 파일 없음 — 게이트 건너뜀"
288
+ return 0
289
+ fi
290
+
291
+ log_info "메트릭 파일: $METRICS_FILE"
292
+
293
+ # 반복률 (더 엄격: < 5%)
294
+ REP_RATE=$(json_get "$METRICS_FILE" "rep_rate" 2>/dev/null || echo "NOT_FOUND")
295
+ if [[ "$REP_RATE" == "NOT_FOUND" ]]; then
296
+ record_skip "rep_rate 키 없음 — 건너뜀"
297
+ else
298
+ REP_PCT=$(py_value "$REP_RATE * 100")
299
+ log_info "반복률 = ${REP_PCT}% (기준: < 5%)"
300
+ if py_eval "$REP_RATE < 0.05" 2>/dev/null; then
301
+ record_pass "반복률 ${REP_PCT}% < 5%"
302
+ else
303
+ record_fail "반복률 ${REP_PCT}% >= 5% (기준 미달)"
304
+ fi
305
+ fi
306
+
307
+ # KoBEST (더 엄격: > 60%)
308
+ KOBEST=$(json_get "$METRICS_FILE" "kobest_score" 2>/dev/null || echo "NOT_FOUND")
309
+ if [[ "$KOBEST" == "NOT_FOUND" ]]; then
310
+ record_skip "kobest_score 키 없음 — 건너뜀"
311
+ else
312
+ KOBEST_PCT=$(py_value "$KOBEST * 100")
313
+ log_info "KoBEST = ${KOBEST_PCT}% (기준: > 60%)"
314
+ if py_eval "$KOBEST > 0.60" 2>/dev/null; then
315
+ record_pass "KoBEST ${KOBEST_PCT}% > 60%"
316
+ else
317
+ record_fail "KoBEST ${KOBEST_PCT}% <= 60% (기준 미달)"
318
+ fi
319
+ fi
320
+
321
+ # Chosen win rate (chosen log-prob > rejected log-prob 비율)
322
+ CHOSEN_WIN=$(json_get "$METRICS_FILE" "chosen_win_rate" 2>/dev/null || echo "NOT_FOUND")
323
+ if [[ "$CHOSEN_WIN" == "NOT_FOUND" ]]; then
324
+ record_skip "chosen_win_rate 키 없음 — 건너뜀"
325
+ else
326
+ WIN_PCT=$(py_value "$CHOSEN_WIN * 100")
327
+ log_info "Chosen win rate = ${WIN_PCT}% (기준: >= 90%)"
328
+ if py_eval "$CHOSEN_WIN >= 0.90" 2>/dev/null; then
329
+ record_pass "Chosen win rate ${WIN_PCT}% >= 90%"
330
+ else
331
+ record_fail "Chosen win rate ${WIN_PCT}% < 90% (기준 미달)"
332
+ fi
333
+ fi
334
+ }
335
+
336
+ # =============================================================================
337
+ # Gate 4: Deploy
338
+ # =============================================================================
339
+ gate_deploy() {
340
+ echo ""
341
+ echo "=================================================================="
342
+ echo " Gate: DEPLOY"
343
+ echo " 기준: Q4_K_M perplexity < F16 × 1.05 | Ollama 5개 프롬프트 응답"
344
+ echo "=================================================================="
345
+
346
+ local MODEL_NAME="frankenstallm-3b"
347
+ local GGUF_DIR="$PROJECT_DIR/outputs/gguf"
348
+ local F16_GGUF="$GGUF_DIR/${MODEL_NAME}-f16.gguf"
349
+ local Q4KM_GGUF="$GGUF_DIR/${MODEL_NAME}-Q4_K_M.gguf"
350
+
351
+ # --- GGUF 파일 존재 확인 ---
352
+ if [[ ! -f "$Q4KM_GGUF" ]]; then
353
+ log_warn "Q4_K_M GGUF 파일 없음: $Q4KM_GGUF"
354
+ log_warn "먼저 실행: bash scripts/convert_3b_gguf.sh"
355
+ record_skip "GGUF 파일 없음 — perplexity 게이트 건너뜀"
356
+ else
357
+ # perplexity 측정 (llama-perplexity 또는 Python fallback)
358
+ LLAMA_PPL_BIN="$PROJECT_DIR/outputs/llama.cpp/build/bin/llama-perplexity"
359
+
360
+ if [[ ! -f "$LLAMA_PPL_BIN" ]]; then
361
+ log_warn "llama-perplexity 바이너리 없음 — 빌드 시도 중 ..."
362
+ cmake --build "$PROJECT_DIR/outputs/llama.cpp/build" \
363
+ --target llama-perplexity -j "$(nproc)" &>/dev/null || true
364
+ fi
365
+
366
+ # 샘플 텍스트로 perplexity 비교
367
+ SAMPLE_TEXT="$PROJECT_DIR/outputs/gguf/ppl_sample.txt"
368
+ if [[ ! -f "$SAMPLE_TEXT" ]]; then
369
+ # 짧은 한국어 샘플 생성
370
+ cat > "$SAMPLE_TEXT" <<'SAMPLE'
371
+ 인공지능은 현대 사회에서 매우 중요한 기술로 자리잡고 있습니다.
372
+ 기계 학습과 딥러닝의 발전으로 인해 다양한 분야에서 혁신이 이루어지고 있습니다.
373
+ 자연어 처리 기술의 발전은 인간과 컴퓨터의 상호작용 방식을 근본적으로 변화시키고 있습니다.
374
+ 한국어는 교착어로서 특유의 형태론적 특성을 가지고 있어 자연어 처리에 독특한 도전을 제시합니다.
375
+ 대규모 언어 모델의 등장으로 기계 번역, 텍스트 요약, 질의응답 등의 성능이 크게 향상되었습니다.
376
+ SAMPLE
377
+ fi
378
+
379
+ if [[ -f "$LLAMA_PPL_BIN" && -f "$F16_GGUF" ]]; then
380
+ log_info "Perplexity 측정 중 (F16 vs Q4_K_M) ..."
381
+
382
+ PPL_F16=$(timeout 120 "$LLAMA_PPL_BIN" -m "$F16_GGUF" -f "$SAMPLE_TEXT" 2>&1 \
383
+ | grep -oP "Perplexity: \K[0-9.]+" | head -1 || echo "0")
384
+ PPL_Q4=$(timeout 120 "$LLAMA_PPL_BIN" -m "$Q4KM_GGUF" -f "$SAMPLE_TEXT" 2>&1 \
385
+ | grep -oP "Perplexity: \K[0-9.]+" | head -1 || echo "0")
386
+
387
+ if [[ "$PPL_F16" == "0" || "$PPL_Q4" == "0" ]]; then
388
+ record_skip "Perplexity 측정 실패 — 건너뜀"
389
+ else
390
+ THRESHOLD=$(py_value "$PPL_F16 * 1.05")
391
+ log_info "F16 PPL = $PPL_F16 | Q4_K_M PPL = $PPL_Q4 | 기준: < $THRESHOLD"
392
+ if py_eval "$PPL_Q4 < $PPL_F16 * 1.05" 2>/dev/null; then
393
+ record_pass "Q4_K_M PPL $PPL_Q4 < F16 PPL × 1.05 ($THRESHOLD)"
394
+ else
395
+ record_fail "Q4_K_M PPL $PPL_Q4 >= F16 PPL × 1.05 ($THRESHOLD)"
396
+ fi
397
+ fi
398
+ else
399
+ record_skip "llama-perplexity 또는 F16 GGUF 없음 — perplexity 게이트 건너뜀"
400
+ fi
401
+ fi
402
+
403
+ # --- Ollama 응답 테스트 ---
404
+ if ! command -v ollama &>/dev/null; then
405
+ record_skip "ollama 없음 — 응답 테스트 건너뜀"
406
+ return 0
407
+ fi
408
+
409
+ if ! ollama list 2>/dev/null | grep -q "$MODEL_NAME"; then
410
+ log_warn "Ollama에 $MODEL_NAME 모델이 등록되지 않았습니다."
411
+ log_warn "먼저 실행: bash scripts/deploy_3b_ollama.sh"
412
+ record_skip "Ollama 모델 미등록 — 응답 테스트 건너뜀"
413
+ return 0
414
+ fi
415
+
416
+ log_info "Ollama 응답 테스트 (5개 프롬프트) ..."
417
+
418
+ declare -a PROMPTS=(
419
+ "안녕하세요."
420
+ "1 더하기 1은 무엇인가요?"
421
+ "파이썬이란 무엇인가요?"
422
+ "한국의 수도는 어디인가요?"
423
+ "오늘 날씨가 좋네요."
424
+ )
425
+
426
+ local PASS=0 FAIL=0
427
+ for i in "${!PROMPTS[@]}"; do
428
+ local PROMPT="${PROMPTS[$i]}"
429
+ local NUM=$((i + 1))
430
+ if RESP=$(timeout 45 ollama run "$MODEL_NAME" "$PROMPT" 2>&1) && [[ -n "$RESP" ]]; then
431
+ log_ok " 프롬프트 $NUM 응답 OK (${#RESP}자)"
432
+ PASS=$((PASS + 1))
433
+ else
434
+ log_fail " 프롬프트 $NUM 응답 실패"
435
+ FAIL=$((FAIL + 1))
436
+ fi
437
+ done
438
+
439
+ log_info "Ollama 응답: $PASS/5 성공"
440
+ if [[ $FAIL -eq 0 ]]; then
441
+ record_pass "Ollama 5개 프롬프트 모두 응답 성공"
442
+ else
443
+ record_fail "Ollama 응답 실패 $FAIL/5"
444
+ fi
445
+ }
446
+
447
+ # =============================================================================
448
+ # 최종 요약 출력
449
+ # =============================================================================
450
+ print_summary() {
451
+ local phase="$1"
452
+ local TOTAL=$((GATE_PASS + GATE_FAIL + GATE_SKIP))
453
+ echo ""
454
+ echo "=================================================================="
455
+ echo " Quality Gate 결과: $phase"
456
+ echo " PASS: $GATE_PASS | FAIL: $GATE_FAIL | SKIP: $GATE_SKIP | TOTAL: $TOTAL"
457
+ echo "=================================================================="
458
+
459
+ if [[ $GATE_FAIL -eq 0 ]]; then
460
+ echo -e "${_GREEN} [GATE PASSED]${_NC} 모든 검증 기준 통과"
461
+ echo ""
462
+ return 0
463
+ else
464
+ echo -e "${_RED} [GATE FAILED]${_NC} ${GATE_FAIL}개 검증 기준 미달"
465
+ echo " 실패 항목을 수정한 후 다시 실행하세요."
466
+ echo ""
467
+ return 1
468
+ fi
469
+ }
470
+
471
+ # =============================================================================
472
+ # 진입점
473
+ # =============================================================================
474
+ PHASE="${1:-}"
475
+
476
+ if [[ -z "$PHASE" ]]; then
477
+ echo "Usage: bash scripts/quality_gate.sh <phase>"
478
+ echo " phase: pretrain | sft | orpo | deploy | all"
479
+ exit 2
480
+ fi
481
+
482
+ echo ""
483
+ echo "=================================================================="
484
+ echo " Quality Gate 검증 시작: $PHASE"
485
+ echo " 프로젝트: $PROJECT_DIR"
486
+ echo " 시각 : $(date '+%Y-%m-%d %H:%M:%S')"
487
+ echo "=================================================================="
488
+
489
+ case "$PHASE" in
490
+ pretrain)
491
+ gate_pretrain
492
+ print_summary "pretrain"
493
+ ;;
494
+ sft)
495
+ gate_sft
496
+ print_summary "sft"
497
+ ;;
498
+ orpo)
499
+ gate_orpo
500
+ print_summary "orpo"
501
+ ;;
502
+ deploy)
503
+ gate_deploy
504
+ print_summary "deploy"
505
+ ;;
506
+ all)
507
+ gate_pretrain
508
+ gate_sft
509
+ gate_orpo
510
+ gate_deploy
511
+ print_summary "all"
512
+ ;;
513
+ *)
514
+ echo "ERROR: 알 수 없는 phase: $PHASE"
515
+ echo "Usage: bash scripts/quality_gate.sh <pretrain|sft|orpo|deploy|all>"
516
+ exit 2
517
+ ;;
518
+ esac
source/scripts/run_eval.sh ADDED
@@ -0,0 +1,23 @@
1
+ #!/usr/bin/env bash
2
+ # Usage: bash scripts/run_eval.sh <checkpoint_dir>
3
+ # Example: bash scripts/run_eval.sh checkpoints/korean_1b_fp8_run1/checkpoint-0200000
4
+ set -euo pipefail
5
+
6
+ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
7
+ PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
8
+
9
+ CHECKPOINT="${1:?Usage: bash scripts/run_eval.sh <checkpoint_dir>}"
10
+
11
+ echo "=== Perplexity Evaluation ==="
12
+ python "$PROJECT_DIR/eval/perplexity.py" \
13
+ --checkpoint "$CHECKPOINT" \
14
+ --data "$PROJECT_DIR/data/korean_val.bin" \
15
+ --device cuda:0
16
+
17
+ echo ""
18
+ echo "=== Text Generation ==="
19
+ python "$PROJECT_DIR/eval/generate.py" \
20
+ --checkpoint "$CHECKPOINT" \
21
+ --prompt "안녕하세요, 저는" \
22
+ --max_new_tokens 200 \
23
+ --device cuda:0
source/scripts/run_eval_full.sh ADDED
@@ -0,0 +1,236 @@
1
+ #!/usr/bin/env bash
2
+ # ============================================================
3
+ # run_eval_full.sh — 전체 한국어 벤치마크 평가 (목표: 1.5-3시간)
4
+ #
5
+ # 사용법:
6
+ # bash scripts/run_eval_full.sh [CHECKPOINT_DIR] [OUTPUT_DIR]
7
+ #
8
+ # 예시:
9
+ # bash scripts/run_eval_full.sh \
10
+ # checkpoints/korean_1b_sft/checkpoint-0005000 \
11
+ # eval/outputs/full_5000
12
+ #
13
+ # 태스크:
14
+ # - KoBEST (5): boolq, copa, hellaswag, sentineg, wic
15
+ # - HAE-RAE Bench (5): general_knowledge, history, loan_word, rare_word, standard_nomenclature
16
+ # - Global MMLU Korean: 57개 도메인
17
+ # - PAWS-Ko: 패러프레이즈 탐지
18
+ # - KorMedMCQA: 한국어 의학 MCQ (선택)
19
+ #
20
+ # 총 예상 샘플: ~15,000개
21
+ # 1B 모델 @ 8×B200 기준: 약 1.5-3시간
22
+ # ============================================================
23
+ set -euo pipefail
24
+
25
+ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
26
+ PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
27
+
28
+ # ─── 인자 처리 ────────────────────────────────────────────
29
+ CHECKPOINT="${1:-checkpoints/korean_1b_sft/checkpoint-0005000}"
30
+ TIMESTAMP="$(date +%Y%m%d_%H%M%S)"
31
+ OUTPUT_DIR="${2:-eval/outputs/full_${TIMESTAMP}}"
32
+
33
+ [[ "$CHECKPOINT" != /* ]] && CHECKPOINT="$PROJECT_DIR/$CHECKPOINT"
34
+ [[ "$OUTPUT_DIR" != /* ]] && OUTPUT_DIR="$PROJECT_DIR/$OUTPUT_DIR"
35
+
36
+ # ─── 설정 ────────────────────────────────────────────────
37
+ HF_MODEL_DIR="$PROJECT_DIR/outputs/hf_$(basename "$CHECKPOINT")"
38
+ TOKENIZER="$PROJECT_DIR/tokenizer/korean_sp/tokenizer.json"
39
+
40
+ # GPU 설정: 단일 GPU 또는 tensor parallel
41
+ # lm-eval의 hf backend는 기본 단일 GPU 사용
42
+ # 멀티 GPU: --model_args "pretrained=...,parallelize=True" (자동 device_map)
43
+ USE_MULTI_GPU="${USE_MULTI_GPU:-0}"
44
+ if [ "$USE_MULTI_GPU" = "1" ]; then
45
+ MODEL_EXTRA_ARGS=",parallelize=True"
46
+ echo "▶ 멀티 GPU 모드 활성화 (device_map=auto)"
47
+ else
48
+ MODEL_EXTRA_ARGS=""
49
+ CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0}"
50
+ fi
51
+
52
+ BATCH_SIZE="${BATCH_SIZE:-auto}"
53
+ NUM_FEWSHOT="${NUM_FEWSHOT:-0}"
54
+
55
+ # ─── 태스크 정의 ─────────────────────────────────────────
56
+ # Core Korean tasks (항상 실행)
57
+ TASKS_CORE="kobest,haerae,paws_ko"
58
+
59
+ # Extended tasks (시간 있을 때)
60
+ TASKS_EXTENDED="global_mmlu_ko"
61
+
62
+ # 선택적 태스크
63
+ TASKS_OPTIONAL="kormedmcqa" # 한국어 의학 MCQ
64
+
65
+ # 전체 실행 태스크
66
+ TASKS="${TASKS_CORE},${TASKS_EXTENDED}"
67
+
68
+ # ─── 의존성 확인 ─────────────────────────────────────────
69
+ check_dep() {
70
+ python3 -c "import $1" 2>/dev/null || { echo "❌ $1 not found. pip install $2"; exit 1; }
71
+ }
72
+ check_dep lm_eval lm-eval
73
+ check_dep transformers transformers
74
+ check_dep safetensors safetensors
75
+
76
+ echo "=================================================="
77
+ echo " Ko-LLM Full Benchmark Evaluation"
78
+ echo "=================================================="
79
+ echo " Checkpoint : $CHECKPOINT"
80
+ echo " HF output : $HF_MODEL_DIR"
81
+ echo " Tasks : $TASKS"
82
+ echo " Few-shot : $NUM_FEWSHOT"
83
+ echo " Batch size : $BATCH_SIZE"
84
+ echo " Output : $OUTPUT_DIR"
85
+ echo " Multi-GPU : $USE_MULTI_GPU"
86
+ echo " Start time : $(date)"
87
+ echo "=================================================="
88
+
89
+ mkdir -p "$OUTPUT_DIR"
90
+ LOG_FILE="$OUTPUT_DIR/eval_full.log"
91
+
92
+ # ─── Step 1: HF 포맷 변환 ───────────────────────────────
93
+ echo ""
94
+ echo "▶ [1/3] 커스텀 체크포인트 → HF 포맷 변환..."
95
+
96
+ if [ ! -f "$HF_MODEL_DIR/config.json" ]; then
97
+ python3 "$PROJECT_DIR/scripts/convert_to_hf.py" \
98
+ --checkpoint "$CHECKPOINT" \
99
+ --output "$HF_MODEL_DIR" \
100
+ --tokenizer "$TOKENIZER" \
101
+ 2>&1 | tee -a "$LOG_FILE"
102
+ echo "✅ HF 변환 완료: $HF_MODEL_DIR"
103
+ else
104
+ echo " ↳ HF 모델 이미 존재, 변환 스킵: $HF_MODEL_DIR"
105
+ fi
106
+
107
+ # ─── Step 2: 전체 평가 ──────────────────────────────────
108
+ echo ""
109
+ echo "▶ [2/3] lm-eval 전체 평가 시작..."
110
+ echo " ↳ 로그: $LOG_FILE"
111
+ START_TIME=$(date +%s)
112
+
113
+ if [ "$USE_MULTI_GPU" = "1" ]; then
114
+ python3 -m lm_eval \
115
+ --model hf \
116
+ --model_args "pretrained=$HF_MODEL_DIR,dtype=float16,parallelize=True" \
117
+ --tasks "$TASKS" \
118
+ --num_fewshot "$NUM_FEWSHOT" \
119
+ --batch_size "$BATCH_SIZE" \
120
+ --output_path "$OUTPUT_DIR" \
121
+ --log_samples \
122
+ --verbosity INFO \
123
+ 2>&1 | tee -a "$LOG_FILE"
124
+ else
125
+ CUDA_VISIBLE_DEVICES="$CUDA_VISIBLE_DEVICES" python3 -m lm_eval \
126
+ --model hf \
127
+ --model_args "pretrained=$HF_MODEL_DIR,dtype=float16" \
128
+ --tasks "$TASKS" \
129
+ --num_fewshot "$NUM_FEWSHOT" \
130
+ --batch_size "$BATCH_SIZE" \
131
+ --output_path "$OUTPUT_DIR" \
132
+ --log_samples \
133
+ --verbosity INFO \
134
+ 2>&1 | tee -a "$LOG_FILE"
135
+ fi
136
+
137
+ END_TIME=$(date +%s)
138
+ ELAPSED=$(( END_TIME - START_TIME ))
139
+ echo ""
140
+ echo "✅ 평가 완료! 소요: $((ELAPSED/60))분 $((ELAPSED%60))초"
141
+
142
+ # ─── Step 3: 결과 요약 리포트 생성 ─────────────────────
143
+ echo ""
144
+ echo "▶ [3/3] 결과 리포트 생성..."
145
+
146
+ python3 - "$OUTPUT_DIR" "$CHECKPOINT" <<'PYEOF'
147
+ import json, glob, sys, os
148
+ from datetime import datetime
149
+
150
+ output_dir = sys.argv[1]
151
+ checkpoint = sys.argv[2] if len(sys.argv) > 2 else "unknown"
152
+
153
+ results_files = sorted(glob.glob(f"{output_dir}/**/*.json", recursive=True))
154
+ results_files = [f for f in results_files if "samples_" not in os.path.basename(f)]
155
+
156
+ report_lines = [
157
+ f"# Ko-LLM Full Eval Report",
158
+ f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
159
+ f"Checkpoint: {checkpoint}",
160
+ "",
161
+ ]
162
+
163
+ all_results = {}
164
+ for rf in results_files:
165
+ try:
166
+ with open(rf) as f:
167
+ data = json.load(f)
168
+ results = data.get("results", {})
169
+ if results:
170
+ all_results.update(results)
171
+ except Exception:
172
+ pass
173
+
174
+ # KoBEST 요약
175
+ kobest_tasks = [k for k in all_results if k.startswith("kobest_")]
176
+ if kobest_tasks:
177
+ report_lines.append("## KoBEST")
178
+ report_lines.append("| Task | Metric | Score |")
179
+ report_lines.append("|------|--------|-------|")
180
+ for task in sorted(kobest_tasks):
181
+ metrics = all_results[task]
182
+ for key, val in metrics.items():
183
+ if "stderr" not in key and isinstance(val, (int, float)):
184
+ report_lines.append(f"| {task} | {key} | {val:.4f} |")
185
+
186
+ # HAE-RAE 요약
187
+ haerae_tasks = [k for k in all_results if k.startswith("haerae")]
188
+ if haerae_tasks:
189
+ report_lines.append("\n## HAE-RAE Bench")
190
+ report_lines.append("| Task | Metric | Score |")
191
+ report_lines.append("|------|--------|-------|")
192
+ for task in sorted(haerae_tasks):
193
+ metrics = all_results[task]
194
+ for key, val in metrics.items():
195
+ if "stderr" not in key and isinstance(val, (int, float)):
196
+ report_lines.append(f"| {task} | {key} | {val:.4f} |")
197
+
198
+ # MMLU Ko 요약 (상위 레벨만)
199
+ mmlu_top = {k: v for k, v in all_results.items()
200
+ if k.startswith("global_mmlu_ko") and "_" not in k.replace("global_mmlu_ko", "")}
201
+ if mmlu_top:
202
+ report_lines.append("\n## Global MMLU (Korean)")
203
+ for task, metrics in mmlu_top.items():
204
+ for key, val in metrics.items():
205
+ if "stderr" not in key and isinstance(val, (int, float)):
206
+ report_lines.append(f"- {task} {key}: {val:.4f}")
207
+
208
+ # 기타
209
+ other_tasks = [k for k in all_results
210
+ if not k.startswith("kobest_")
211
+ and not k.startswith("haerae")
212
+ and not k.startswith("global_mmlu_ko")]
213
+ if other_tasks:
214
+ report_lines.append("\n## 기타 태스크")
215
+ for task in sorted(other_tasks):
216
+ metrics = all_results[task]
217
+ for key, val in metrics.items():
218
+ if "stderr" not in key and isinstance(val, (int, float)):
219
+ report_lines.append(f"- {task} | {key}: {val:.4f}")
220
+
221
+ report_path = os.path.join(output_dir, "SUMMARY.md")
222
+ with open(report_path, "w") as f:
223
+ f.write("\n".join(report_lines))
224
+
225
+ print("\n".join(report_lines))
226
+ print(f"\n📄 리포트 저장: {report_path}")
227
+ PYEOF
228
+
229
+ echo ""
230
+ echo "=================================================="
231
+ echo "✅ 전체 평가 완료!"
232
+ echo " 결과 디렉토리: $OUTPUT_DIR"
233
+ echo " 요약 리포트 : $OUTPUT_DIR/SUMMARY.md"
234
+ echo " 전체 로그 : $LOG_FILE"
235
+ echo " 완료 시각 : $(date)"
236
+ echo "=================================================="
source/scripts/run_eval_quick.sh ADDED
@@ -0,0 +1,150 @@
1
+ #!/usr/bin/env bash
2
+ # ============================================================
3
+ # run_eval_quick.sh — 빠른 평가 체크 (목표: 20-30분)
4
+ #
5
+ # 사용법:
6
+ # bash scripts/run_eval_quick.sh [CHECKPOINT_DIR] [OUTPUT_DIR]
7
+ #
8
+ # 예시:
9
+ # bash scripts/run_eval_quick.sh \
10
+ # checkpoints/korean_1b_sft/checkpoint-0005000 \
11
+ # eval/outputs/quick_5000
12
+ #
13
+ # 태스크: kobest_boolq, kobest_copa, haerae_general_knowledge,
14
+ # haerae_history, paws_ko
15
+ # ============================================================
16
+ set -euo pipefail
17
+
18
+ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
19
+ PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
20
+
21
+ # ─── 인자 처리 ────────────────────────────────────────────
22
+ CHECKPOINT="${1:-checkpoints/korean_1b_sft/checkpoint-0005000}"
23
+ TIMESTAMP="$(date +%Y%m%d_%H%M%S)"
24
+ OUTPUT_DIR="${2:-eval/outputs/quick_${TIMESTAMP}}"
25
+
26
+ # 상대 경로 → 절대 경로
27
+ [[ "$CHECKPOINT" != /* ]] && CHECKPOINT="$PROJECT_DIR/$CHECKPOINT"
28
+ [[ "$OUTPUT_DIR" != /* ]] && OUTPUT_DIR="$PROJECT_DIR/$OUTPUT_DIR"
29
+
30
+ # ─── 설정 ────────────────────────────────────────────────
31
+ HF_MODEL_DIR="$PROJECT_DIR/outputs/hf_$(basename "$CHECKPOINT")"
32
+ TOKENIZER="$PROJECT_DIR/tokenizer/korean_sp/tokenizer.json"
33
+ DEVICE="${CUDA_VISIBLE_DEVICES:-0}" # 기본: GPU 0번만 사용
34
+ BATCH_SIZE="auto"
35
+
36
+ # 빠른 체크 태스크 (약 2,000 샘플, ~20분)
37
+ TASKS="kobest_boolq,kobest_copa,haerae_general_knowledge,haerae_history,paws_ko"
38
+
39
+ # ─── 의존성 확인 ─────────────────────────────────────────
40
+ check_dep() {
41
+ python3 -c "import $1" 2>/dev/null || { echo "❌ $1 not found. pip install $2"; exit 1; }
42
+ }
43
+ check_dep lm_eval lm-eval
44
+ check_dep transformers transformers
45
+ check_dep safetensors safetensors
46
+
47
+ echo "=================================================="
48
+ echo " Ko-LLM Quick Eval"
49
+ echo "=================================================="
50
+ echo " Checkpoint : $CHECKPOINT"
51
+ echo " HF output : $HF_MODEL_DIR"
52
+ echo " Tasks : $TASKS"
53
+ echo " Output : $OUTPUT_DIR"
54
+ echo " Device : cuda:$DEVICE"
55
+ echo "=================================================="
56
+
57
+ mkdir -p "$OUTPUT_DIR"
58
+
59
+ # ─── Step 1: HF 포맷 변환 ───────────────────────────────
60
+ if [ ! -f "$HF_MODEL_DIR/config.json" ]; then
61
+ echo ""
62
+ echo "▶ Step 1: 커스텀 체크포인트 → HF 포맷 변환..."
63
+ python3 "$PROJECT_DIR/scripts/convert_to_hf.py" \
64
+ --checkpoint "$CHECKPOINT" \
65
+ --output "$HF_MODEL_DIR" \
66
+ --tokenizer "$TOKENIZER"
67
+ echo "✅ HF 변환 완료: $HF_MODEL_DIR"
68
+ else
69
+ echo "▶ Step 1: HF 모델 이미 존재, 변환 스킵"
70
+ echo " $HF_MODEL_DIR"
71
+ fi
72
+
73
+ # ─── Step 2: lm-eval 실행 ───────────────────────────────
74
+ echo ""
75
+ echo "▶ Step 2: lm-eval 평가 시작..."
76
+ START_TIME=$(date +%s)
77
+
78
+ CUDA_VISIBLE_DEVICES="$DEVICE" python3 -m lm_eval \
79
+ --model hf \
80
+ --model_args "pretrained=$HF_MODEL_DIR,dtype=float16" \
81
+ --tasks "$TASKS" \
82
+ --num_fewshot 0 \
83
+ --batch_size "$BATCH_SIZE" \
84
+ --output_path "$OUTPUT_DIR" \
85
+ --log_samples \
86
+ --verbosity INFO \
87
+ 2>&1 | tee "$OUTPUT_DIR/eval.log"
88
+
89
+ END_TIME=$(date +%s)
90
+ ELAPSED=$(( END_TIME - START_TIME ))
91
+
92
+ echo ""
93
+ echo "=================================================="
94
+ echo "✅ 평가 완료!"
95
+ echo " 소요시간: $((ELAPSED / 60))분 $((ELAPSED % 60))초"
96
+ echo " 결과 저장: $OUTPUT_DIR"
97
+ echo "=================================================="
98
+
99
+ # ─── Step 3: 결과 요약 출력 ─────────────────────────────
100
+ echo ""
101
+ echo "▶ Step 3: 결과 요약"
102
+ python3 - "$OUTPUT_DIR" <<'PYEOF'
130
+ import json, glob, sys, os
131
+ output_dir = sys.argv[1] if len(sys.argv) > 1 else "."
132
+ results_files = glob.glob(f"{output_dir}/**/*.json", recursive=True)
133
+ results_files = [f for f in results_files if "results" in os.path.basename(f)]
134
+ if not results_files:
135
+ # try finding any json
136
+ results_files = glob.glob(f"{output_dir}/*.json")
137
+ for rf in results_files[:3]:
138
+ try:
139
+ with open(rf) as f:
140
+ data = json.load(f)
141
+ results = data.get("results", {})
142
+ print(f"\n{'='*50}\nTask Results: {os.path.basename(rf)}\n{'='*50}")
143
+ for task, metrics in results.items():
144
+ print(f"\n{task}:")
145
+ for key, val in metrics.items():
146
+ if "stderr" not in key and isinstance(val, (int, float)):
147
+ print(f" {key}: {val:.4f}")
148
+ except Exception as e:
149
+ print(f"파싱 실패: {rf}: {e}")
150
+ PYEOF
source/scripts/run_pretrain.sh ADDED
@@ -0,0 +1,26 @@
1
+ #!/bin/bash
2
+ # Usage: bash scripts/run_pretrain.sh [additional pretrain.py args]
3
+ # Runs 8-GPU DDP pretraining via torchrun.
4
+ #
5
+ # Any extra arguments are forwarded verbatim to pretrain.py.
6
+ # Examples:
7
+ # bash scripts/run_pretrain.sh --max_steps 200000
8
+ # bash scripts/run_pretrain.sh --resume checkpoints/checkpoint-0010000
9
+
10
+ set -euo pipefail
11
+
12
+ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
13
+ PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
14
+
15
+ torchrun \
16
+ --nproc_per_node=8 \
17
+ --master_port=29500 \
18
+ "$PROJECT_DIR/train/pretrain.py" \
19
+ --config "$PROJECT_DIR/configs/small.yaml" \
20
+ --train_data "$PROJECT_DIR/data/train.bin" \
21
+ --val_data "$PROJECT_DIR/data/val.bin" \
22
+ --checkpoint_dir "$PROJECT_DIR/checkpoints" \
23
+ --batch_size 8 \
24
+ --grad_accum 4 \
25
+ --warmup_steps 2000 \
26
+ "$@"
source/scripts/start-gateway.sh ADDED
@@ -0,0 +1,44 @@
1
+ #!/usr/bin/env bash
2
+ # start-gateway.sh — OpenClaw 게이트웨이 직접 시작 (독립 프로세스)
3
+ set -euo pipefail
4
+
5
+ RNTIER_HOME="${RNTIER_HOME:-$HOME}"
6
+ export PATH="${RNTIER_HOME}/.npm-global/bin:/usr/bin:/usr/local/bin:/bin:$PATH"
7
+ export HOME="${HOME:-/home/ghong}"
8
+ export OPENCLAW_STATE_DIR="${RNTIER_HOME}/.openclaw"
9
+ export OPENCLAW_CONFIG_PATH="${RNTIER_HOME}/.openclaw/openclaw.json"
10
+
11
+ LOG_DIR="/tmp/openclaw"
12
+ GATEWAY_LOG="${LOG_DIR}/gateway.log"
13
+ PID_FILE="/tmp/openclaw-gateway.pid"
14
+
15
+ mkdir -p "$LOG_DIR"
16
+
17
+ # 기존 프로세스 정리
18
+ pkill -f "openclaw.*gateway" 2>/dev/null || true
19
+ sleep 2
20
+
21
+ # 게이트웨이 시작 — setsid로 완전 분리
22
+ setsid nohup "${RNTIER_HOME}/.npm-global/bin/openclaw" gateway run \
23
+ --port 18789 \
24
+ --bind loopback \
25
+ >> "$GATEWAY_LOG" 2>&1 < /dev/null &
26
+
27
+ PID=$!
28
+ echo "$PID" > "$PID_FILE"
29
+ date +%s > /tmp/openclaw-last-restart
30
+
31
+ echo "[$(date)] Gateway launched with PID $PID"
32
+
33
+ # 10초 대기 후 상태 확인
34
+ sleep 10
35
+
36
+ if kill -0 "$PID" 2>/dev/null; then
37
+ echo "[$(date)] OK: Gateway PID $PID is alive"
38
+ ss -tlnH "sport = :18789" 2>/dev/null | grep -q . && echo "[$(date)] OK: Port 18789 is listening" || echo "[$(date)] WARN: Port 18789 not yet listening"
39
+ else
40
+ echo "[$(date)] FAIL: Gateway PID $PID died"
41
+ echo "--- Last 20 lines of gateway.log ---"
42
+ tail -20 "$GATEWAY_LOG" 2>/dev/null
43
+ exit 1
44
+ fi
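A minimal companion snippet for checking on or stopping the gateway by hand, reusing the PID file, log path, and port from the script above (illustrative only — not a separate script in this repo):

```bash
PID_FILE=/tmp/openclaw-gateway.pid

# Status
if kill -0 "$(cat "$PID_FILE" 2>/dev/null)" 2>/dev/null; then
  echo "gateway up (PID $(cat "$PID_FILE"))"
else
  echo "gateway down"
fi
ss -tlnH "sport = :18789" | grep -q . && echo "port 18789 listening" || echo "port 18789 not listening"

# Stop and show the tail of the log
kill "$(cat "$PID_FILE" 2>/dev/null)" 2>/dev/null || true
tail -5 /tmp/openclaw/gateway.log
```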
source/scripts/telegram_notify.py ADDED
@@ -0,0 +1,168 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Standalone Telegram notification helper for FRANKENSTALLM 3B training.
4
+
5
+ Usage:
6
+ python3 scripts/telegram_notify.py "Your message here"
7
+ python3 scripts/telegram_notify.py "<b>Bold</b> message" --parse-mode HTML
8
+
9
+ Function API:
10
+ from scripts.telegram_notify import send_telegram
11
+ send_telegram("message text")
12
+ """
13
+
14
+ import os
15
+ import sys
16
+ import json
17
+ import urllib.request
18
+ import urllib.parse
19
+ import urllib.error
20
+ import logging
21
+ from typing import Optional
22
+
23
+ # ─── Configuration ────────────────────────────────────────────────────────────
24
+ BOT_TOKEN = os.environ.get("TELEGRAM_BOT_TOKEN", "")
25
+ CHAT_ID = os.environ.get("TELEGRAM_CHAT_ID", "")
26
+ TIMEOUT = 15 # seconds
27
+ MAX_MSG_LEN = 4096 # Telegram limit
28
+
29
+ logging.basicConfig(
30
+ level=logging.WARNING,
31
+ format="%(asctime)s [telegram_notify] %(levelname)s: %(message)s",
32
+ )
33
+ log = logging.getLogger("telegram_notify")
34
+
35
+
36
+ def send_telegram(
37
+ message: str,
38
+ parse_mode: str = "HTML",
39
+ token: str = BOT_TOKEN,
40
+ chat_id: str = CHAT_ID,
41
+ disable_web_page_preview: bool = True,
42
+ ) -> bool:
43
+ """
44
+ Send a Telegram message via Bot API using urllib (curl-free).
45
+
46
+ Args:
47
+ message: Text to send (HTML or Markdown depending on parse_mode).
48
+ parse_mode: "HTML", "Markdown", "MarkdownV2", or "" (plain).
49
+ token: Bot token (defaults to module-level BOT_TOKEN).
50
+ chat_id: Recipient chat/channel ID.
51
+ disable_web_page_preview: Suppress link previews.
52
+
53
+ Returns:
54
+ True on success, False on any error.
55
+ """
56
+ if not message:
57
+ log.warning("Empty message — skipping send.")
58
+ return False
59
+
60
+ # Truncate if over Telegram limit, with notice
61
+ if len(message) > MAX_MSG_LEN:
62
+ truncated_notice = "\n\n<i>[message truncated]</i>" if parse_mode == "HTML" else "\n\n[message truncated]"
63
+ message = message[: MAX_MSG_LEN - len(truncated_notice)] + truncated_notice
64
+
65
+ url = f"https://api.telegram.org/bot{token}/sendMessage"
66
+
67
+ payload: dict = {
68
+ "chat_id": chat_id,
69
+ "text": message,
70
+ "disable_web_page_preview": disable_web_page_preview,
71
+ }
72
+ if parse_mode:
73
+ payload["parse_mode"] = parse_mode
74
+
75
+ data = urllib.parse.urlencode(payload).encode("utf-8")
76
+
77
+ try:
78
+ req = urllib.request.Request(
79
+ url,
80
+ data=data,
81
+ method="POST",
82
+ headers={"Content-Type": "application/x-www-form-urlencoded"},
83
+ )
84
+ with urllib.request.urlopen(req, timeout=TIMEOUT) as resp:
85
+ body = resp.read().decode("utf-8")
86
+ result = json.loads(body)
87
+ if result.get("ok"):
88
+ return True
89
+ else:
90
+ log.error("Telegram API error: %s", result.get("description", result))
91
+ return False
92
+
93
+ except urllib.error.HTTPError as e:
94
+ try:
95
+ err_body = e.read().decode("utf-8")
96
+ except Exception:
97
+ err_body = str(e)
98
+ log.error("HTTP %d from Telegram: %s", e.code, err_body)
99
+ return False
100
+
101
+ except urllib.error.URLError as e:
102
+ log.error("Network error sending Telegram message: %s", e.reason)
103
+ return False
104
+
105
+ except json.JSONDecodeError as e:
106
+ log.error("Failed to parse Telegram response: %s", e)
107
+ return False
108
+
109
+ except Exception as e: # noqa: BLE001
110
+ log.error("Unexpected error in send_telegram: %s", e)
111
+ return False
112
+
113
+
114
+ def send_telegram_safe(message: str, **kwargs) -> bool:
115
+ """
116
+ Wrapper that catches ALL exceptions — guaranteed never to crash the caller.
117
+ Suitable for embedding in training loops where stability is critical.
118
+ """
119
+ try:
120
+ return send_telegram(message, **kwargs)
121
+ except Exception as e: # noqa: BLE001
122
+ log.error("send_telegram_safe caught unhandled exception: %s", e)
123
+ return False
124
+
125
+
126
+ # ─── CLI entry point ──────────────────────────────────────────────────────────
127
+ if __name__ == "__main__":
128
+ import argparse
129
+
130
+ parser = argparse.ArgumentParser(
131
+ description="Send a Telegram message from the command line."
132
+ )
133
+ parser.add_argument("message", nargs="?", help="Message text to send")
134
+ parser.add_argument(
135
+ "--parse-mode",
136
+ default="HTML",
137
+ choices=["HTML", "Markdown", "MarkdownV2", ""],
138
+ help="Telegram parse_mode (default: HTML)",
139
+ )
140
+ parser.add_argument(
141
+ "--token", default=BOT_TOKEN, help="Override bot token"
142
+ )
143
+ parser.add_argument(
144
+ "--chat-id", default=CHAT_ID, help="Override chat ID"
145
+ )
146
+ args = parser.parse_args()
147
+
148
+ # Allow piped stdin if no positional arg given
149
+ if args.message is None:
150
+ if not sys.stdin.isatty():
151
+ args.message = sys.stdin.read().strip()
152
+ else:
153
+ parser.print_help()
154
+ sys.exit(1)
155
+
156
+ ok = send_telegram(
157
+ args.message,
158
+ parse_mode=args.parse_mode,
159
+ token=args.token,
160
+ chat_id=args.chat_id,
161
+ )
162
+
163
+ if ok:
164
+ print("Telegram message sent successfully.")
165
+ sys.exit(0)
166
+ else:
167
+ print("ERROR: Failed to send Telegram message.", file=sys.stderr)
168
+ sys.exit(1)
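A minimal sketch of embedding send_telegram_safe in a training loop — the loop, interval, and loss value are placeholders, the project root must be on PYTHONPATH, and TELEGRAM_BOT_TOKEN / TELEGRAM_CHAT_ID must be set for anything to actually be delivered:

```python
from scripts.telegram_notify import send_telegram_safe

NOTIFY_EVERY = 1000  # steps between progress messages

for step in range(10_000):   # placeholder training loop
    loss = 2.0               # placeholder — use the real training loss here
    if step % NOTIFY_EVERY == 0:
        # Never raises, so a Telegram outage cannot kill the run.
        send_telegram_safe(f"<b>step {step}</b> | loss {loss:.4f}")
```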
source/scripts/test_ollama_repetition.py ADDED
@@ -0,0 +1,148 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ test_ollama_repetition.py — Ollama 배포 모델 반복률 검증
4
+
5
+ ORPO eval과 동일한 프롬프트로 Ollama API 호출 후 n-gram 반복률 + EOS 종료율 측정.
6
+ 목표: 3-gram rep < 3% (한국어 자연 반복 고려), EOS 종료율 > 95%
7
+
8
+ Usage:
9
+ python scripts/test_ollama_repetition.py [--model frankenstallm-3b] [--host localhost:11434]
10
+ """
11
+ import argparse
12
+ import json
13
+ import urllib.request
14
+ import urllib.error
15
+ import sys
16
+ from collections import Counter
17
+
18
+ # ORPO eval에서 사용한 15개 한국어 프롬프트
19
+ TEST_PROMPTS = [
20
+ "대한민국의 수도는 어디인가요?",
21
+ "인공지능이란 무엇인가요?",
22
+ "한국의 전통 음식 중에서 김치에 대해 설명해주세요.",
23
+ "프로그래밍을 배우려면 어떻게 해야 하나요?",
24
+ "지구 온난화의 원인과 대책에 대해 설명해주세요.",
25
+ "한국어의 특징을 3가지 설명해주세요.",
26
+ "좋은 리더의 자질에 대해 논해주세요.",
27
+ "우주 탐사의 의미와 중요성을 설명해주세요.",
28
+ "건강한 생활 습관 5가지를 추천해주세요.",
29
+ "인터넷이 현대 사회에 미친 영향을 분석해주세요.",
30
+ "한국의 교육 제도의 장단점을 설명해주세요.",
31
+ "환경 보호를 위해 개인이 할 수 있는 일을 알려주세요.",
32
+ "4차 산업혁명이 일자리에 미치는 영향을 분석해주세요.",
33
+ "독서의 중요성과 효과적인 독서 방법을 알려주세요.",
34
+ "한국 문화의 세계화에 대해 어떻게 생각하시나요?",
35
+ ]
36
+
37
+
38
+ def compute_ngram_repetition(text: str, n: int) -> float:
39
+ """n-gram 반복률 계산 (0.0 ~ 1.0)"""
40
+ tokens = text.split()
41
+ if len(tokens) < n:
42
+ return 0.0
43
+ ngrams = [tuple(tokens[i:i+n]) for i in range(len(tokens) - n + 1)]
44
+ if not ngrams:
45
+ return 0.0
46
+ counts = Counter(ngrams)
47
+ repeated = sum(c - 1 for c in counts.values() if c > 1)
48
+ return repeated / len(ngrams)
49
+
50
+
51
+ def call_ollama(prompt: str, model: str, host: str, timeout: int = 120) -> dict:
52
+ """Ollama API 호출"""
53
+ url = f"http://{host}/api/generate"
54
+ payload = json.dumps({
55
+ "model": model,
56
+ "prompt": prompt,
57
+ "stream": False,
58
+ }).encode("utf-8")
59
+
60
+ req = urllib.request.Request(url, data=payload, headers={"Content-Type": "application/json"})
61
+ try:
62
+ with urllib.request.urlopen(req, timeout=timeout) as resp:
63
+ return json.loads(resp.read().decode("utf-8"))
64
+ except urllib.error.URLError as e:
65
+ return {"error": str(e), "response": ""}
66
+ except Exception as e:
67
+ return {"error": str(e), "response": ""}
68
+
69
+
70
+ def main():
71
+ parser = argparse.ArgumentParser(description="Ollama 반복률 검증")
72
+ parser.add_argument("--model", default="frankenstallm-3b", help="Ollama 모델 이름")
73
+ parser.add_argument("--host", default="localhost:11434", help="Ollama 서버 주소")
74
+ args = parser.parse_args()
75
+
76
+ print(f"{'='*70}")
77
+ print(f" Ollama 반복률 검증: {args.model}")
78
+ print(f" 서버: {args.host}")
79
+ print(f" 프롬프트: {len(TEST_PROMPTS)}개")
80
+ print(f"{'='*70}\n")
81
+
82
+ results = []
83
+ for i, prompt in enumerate(TEST_PROMPTS, 1):
84
+ print(f"[{i:2d}/{len(TEST_PROMPTS)}] {prompt[:40]}...")
85
+ resp = call_ollama(prompt, args.model, args.host)
86
+
87
+ if "error" in resp and resp["error"]:
88
+ print(f" ERROR: {resp['error']}")
89
+ results.append({"prompt": prompt, "error": resp["error"]})
90
+ continue
91
+
92
+ text = resp.get("response", "")
93
+ eos_done = resp.get("done", False)
94
+
95
+ rep1 = compute_ngram_repetition(text, 1)
96
+ rep2 = compute_ngram_repetition(text, 2)
97
+ rep3 = compute_ngram_repetition(text, 3)
98
+ rep4 = compute_ngram_repetition(text, 4)
99
+
100
+ results.append({
101
+ "prompt": prompt,
102
+ "response_len": len(text),
103
+ "word_count": len(text.split()),
104
+ "eos_done": eos_done,
105
+ "rep1": rep1, "rep2": rep2, "rep3": rep3, "rep4": rep4,
106
+ })
107
+
108
+ preview = text[:100].replace("\n", " ")
109
+ print(f" 응답: {preview}...")
110
+ print(f" 길이: {len(text)}자, EOS: {eos_done}, "
111
+ f"rep(1/2/3/4): {rep1:.2%}/{rep2:.2%}/{rep3:.2%}/{rep4:.2%}")
112
+ print()
113
+
114
+ # --- Summary ---
115
+ valid = [r for r in results if "error" not in r or not r.get("error")]
116
+ if not valid:
117
+ print("ERROR: 유효한 응답 없음")
118
+ sys.exit(1)
119
+
120
+ avg_rep3 = sum(r["rep3"] for r in valid) / len(valid)
121
+ eos_rate = sum(1 for r in valid if r["eos_done"]) / len(valid)
122
+ errors = len(results) - len(valid)
123
+
124
+ print(f"{'='*70}")
125
+ print(f" 결과 요약")
126
+ print(f"{'='*70}")
127
+ print(f" 유효 응답: {len(valid)}/{len(results)} (에러: {errors})")
128
+ print(f" 평균 3-gram 반복률: {avg_rep3:.2%} (목표: < 3%)")
129
+ print(f" EOS 종료율: {eos_rate:.0%} (목표: > 95%)")
130
+ print()
131
+
132
+ # Pass/Fail
133
+ # 한국어는 조사/접속사 자연 반복으로 어절 기준 3-gram rep 1.5~2%가 자연 floor
134
+ # 퇴행적 반복(30%+)과 구별하여 3% 기준 적용
135
+ rep_pass = avg_rep3 < 0.03
136
+ eos_pass = eos_rate > 0.95
137
+ overall = rep_pass and eos_pass
138
+
139
+ print(f" 3-gram 반복률: {'PASS ✓' if rep_pass else 'FAIL ✗'} ({avg_rep3:.2%})")
140
+ print(f" EOS 종료율: {'PASS ✓' if eos_pass else 'FAIL ✗'} ({eos_rate:.0%})")
141
+ print(f" 종합: {'PASS ✓' if overall else 'FAIL ✗'}")
142
+ print(f"{'='*70}")
143
+
144
+ sys.exit(0 if overall else 1)
145
+
146
+
147
+ if __name__ == "__main__":
148
+ main()
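A quick illustration of what the metric reports (not part of the test itself): a fully degenerate output scores high on unigram repetition, while a normal short sentence scores zero. Run from the project root:

```python
from scripts.test_ollama_repetition import compute_ngram_repetition

degenerate = "안녕 안녕 안녕 안녕"                    # four identical tokens
print(compute_ngram_repetition(degenerate, 1))  # 0.75 — 3 repeated unigrams out of 4
print(compute_ngram_repetition(degenerate, 3))  # 0.5  — 1 repeated trigram out of 2

normal = "대한민국의 수도는 서울입니다"
print(compute_ngram_repetition(normal, 3))      # 0.0  — no repeated trigrams
```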
source/scripts/training_watchdog.sh ADDED
@@ -0,0 +1,292 @@
1
+ #!/usr/bin/env bash
2
+ # =============================================================================
3
+ # training_watchdog.sh — FRANKENSTALLM 3B Cron-based Training Watchdog
4
+ # Run: every 10 minutes via cron
5
+ # Alerts via Telegram only when problems are detected.
6
+ # =============================================================================
7
+ set -euo pipefail
8
+
9
+ # ─── Paths ───────────────────────────────────────────────────────────────────
10
+ WORKDIR="${WORKDIR:-$(cd "$(dirname "$0")/.." && pwd)}"
11
+ CKPT_DIR="$WORKDIR/checkpoints/korean_3b_fp8_run1"
12
+ LOG_FILE="$CKPT_DIR/train.log"
13
+ PID_FILE="$CKPT_DIR/train.pid"
14
+ WATCHDOG_LOG="$CKPT_DIR/watchdog.log"
15
+ STATE_FILE="$CKPT_DIR/watchdog.state" # persists last-good step/time
16
+ NOTIFY="python3 $WORKDIR/scripts/telegram_notify.py"
17
+
18
+ # ─── Thresholds ──────────────────────────────────────────────────────────────
19
+ LOSS_SPIKE_THRESHOLD="5.0" # alert if loss > this value
20
+ LOSS_NAN_PATTERN="nan|inf|NaN|Inf"
21
+ STALL_SECONDS=900 # 15 min without new log line → stalled
22
+ DISK_WARN_PCT=85 # alert if disk usage >= this %
23
+ GPU_UTIL_WARN_PCT=20 # alert if avg GPU util drops below this %
24
+ MIN_TOKPS=5000 # alert if tok/s drops below this
25
+ TOTAL_STEPS=57000
26
+ WAIT_COUNT_FILE="/tmp/frankenstallm-wait-count" # 대기 횟수 파일
27
+ MAX_WAIT_COUNT=10 # 이 횟수 초과 시 알림 후 cron 해제
28
+
29
+ # ─── Helpers ─────────────────────────────────────────────────────────────────
30
+ ts() { date '+%Y-%m-%d %H:%M:%S'; }
31
+
32
+ log_msg() {
33
+ echo "[$(ts)] $*"
34
+ }
35
+
36
+ send_alert() {
37
+ local level="$1"
38
+ local msg="$2"
39
+ log_msg "ALERT[$level]: $msg"
40
+ $NOTIFY "<b>[FRANKENSTALLM ALERT] $level</b>
41
+
42
+ $msg
43
+
44
+ <i>$(ts) | watchdog check</i>" || true
45
+ }
46
+
47
+ # ─── 1. Process alive check ──────────────────────────────────────────────────
48
+ check_process() {
49
+ if [[ ! -f "$PID_FILE" ]]; then
50
+ # 대기 모드: PID 파일 없으면 학습 미시작 상태로 카운트
51
+ local wait_count=0
52
+ [[ -f "$WAIT_COUNT_FILE" ]] && wait_count=$(cat "$WAIT_COUNT_FILE" 2>/dev/null || echo 0)
53
+ wait_count=$(( wait_count + 1 ))
54
+ echo "$wait_count" > "$WAIT_COUNT_FILE"
55
+ log_msg "Training not started yet (waiting ${wait_count}/${MAX_WAIT_COUNT})."
56
+
57
+ if (( wait_count > MAX_WAIT_COUNT )); then
58
+ send_alert "WAIT_TIMEOUT" "학습이 <b>${wait_count}회</b> 체크 동안 시작되지 않았습니다 (~$((wait_count * 10))분).
59
+
60
+ PID 파일 없음: <code>$PID_FILE</code>
61
+
62
+ Watchdog cron을 자동 해제합니다. 학습 시작 후 직접 재등록하세요:
63
+ <code>crontab -e</code>"
64
+ # cron에서 training_watchdog 제거
65
+ crontab -l 2>/dev/null | grep -v "training_watchdog" | crontab -
66
+ rm -f "$WAIT_COUNT_FILE"
67
+ log_msg "Watchdog cron entry removed after ${wait_count} waits."
68
+ fi
69
+ return 1
70
+ fi
71
+ # 학습 시작됨 → 대기 카운터 초기화
72
+ rm -f "$WAIT_COUNT_FILE"
73
+
74
+ local pid
75
+ pid=$(cat "$PID_FILE" 2>/dev/null | tr -d '[:space:]')
76
+
77
+ if [[ -z "$pid" ]]; then
78
+ send_alert "PROCESS" "PID file is empty: $PID_FILE"
79
+ return 1
80
+ fi
81
+
82
+ if ! kill -0 "$pid" 2>/dev/null; then
83
+ # Check if it completed normally (step == TOTAL_STEPS)
84
+ local last_step
85
+ last_step=$(grep -oP 'step\s+\K[0-9]+' "$LOG_FILE" 2>/dev/null | tail -1)
86
+ if [[ "$last_step" == "$TOTAL_STEPS" ]]; then
87
+ log_msg "Training COMPLETED at step $TOTAL_STEPS — process exit is expected."
88
+ send_alert "COMPLETE" "Training completed normally at step <code>$TOTAL_STEPS/$TOTAL_STEPS</code>."
89
+ else
90
+ send_alert "CRASH" "Training process (PID $pid) is NOT running.
91
+ Last logged step: <code>${last_step:-unknown}</code>/$TOTAL_STEPS
92
+
93
+ Check log: <code>tail -50 $LOG_FILE</code>"
94
+ fi
95
+ return 1
96
+ fi
97
+
98
+ log_msg "Process PID $pid is alive."
99
+ return 0
100
+ }
101
+
102
+ # ─── 2. Stall detection ──────────────────────────────────────────────────────
103
+ check_stall() {
104
+ if [[ ! -f "$LOG_FILE" ]]; then
105
+ send_alert "STALL" "Log file not found: $LOG_FILE"
106
+ return 1
107
+ fi
108
+
109
+ local log_mtime now elapsed
110
+ log_mtime=$(stat -c '%Y' "$LOG_FILE" 2>/dev/null || echo 0)
111
+ now=$(date +%s)
112
+ elapsed=$(( now - log_mtime ))
113
+
114
+ if (( elapsed >= STALL_SECONDS )); then
115
+ local mins=$(( elapsed / 60 ))
116
+ send_alert "STALL" "No log activity for <b>${mins} minutes</b> (threshold: $(( STALL_SECONDS/60 ))min).
117
+ Log last modified: <code>$(date -d "@$log_mtime" '+%Y-%m-%d %H:%M:%S')</code>
118
+ Training may be hung or extremely slow."
119
+ return 1
120
+ fi
121
+
122
+ log_msg "Log freshness OK: last update ${elapsed}s ago."
123
+ return 0
124
+ }
125
+
126
+ # ─── 3. Loss anomaly check ───────────────────────────────────────────────────
127
+ check_loss() {
128
+ if [[ ! -f "$LOG_FILE" ]]; then
129
+ return 0
130
+ fi
131
+
132
+ # Get last step line
133
+ local last_line
134
+ last_line=$(grep -E 'step\s+[0-9]+.*loss' "$LOG_FILE" 2>/dev/null | tail -1)
135
+
136
+ if [[ -z "$last_line" ]]; then
137
+ log_msg "No step lines found in log yet."
138
+ return 0
139
+ fi
140
+
141
+ local loss step
142
+ loss=$(echo "$last_line" | grep -oP 'loss\s+\K[0-9.eE+\-naifNIF]+' || echo "")
143
+ step=$(echo "$last_line" | grep -oP 'step\s+\K[0-9]+' || echo "0")
144
+
145
+ if [[ -z "$loss" ]]; then
146
+ log_msg "Could not parse loss from: $last_line"
147
+ return 0
148
+ fi
149
+
150
+ # NaN/Inf check
151
+ if echo "$loss" | grep -qiE "$LOSS_NAN_PATTERN"; then
152
+ send_alert "LOSS_NAN" "Loss is <b>$loss</b> at step <code>$step</code>.
153
+ Training has diverged — NaN/Inf detected.
154
+
155
+ Last log line:
156
+ <code>${last_line}</code>"
157
+ return 1
158
+ fi
159
+
160
+ # Spike check (only after warmup, step > 500)
161
+ if (( step > 500 )); then
162
+ local loss_int
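+ # NOTE: the float comparison below relies on bc; if bc is unavailable, the "|| echo 0" fallback silently disables spike alerts.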
163
+ loss_int=$(echo "$loss >= $LOSS_SPIKE_THRESHOLD" | bc -l 2>/dev/null || echo 0)
164
+ if [[ "$loss_int" == "1" ]]; then
165
+ send_alert "LOSS_SPIKE" "Loss spike detected: <b>$loss</b> at step <code>$step</code> (threshold: $LOSS_SPIKE_THRESHOLD).
166
+
167
+ Last log line:
168
+ <code>${last_line}</code>"
169
+ return 1
170
+ fi
171
+ fi
172
+
173
+ log_msg "Loss OK: $loss at step $step."
174
+ return 0
175
+ }
176
+
177
+ # ─── 4. Throughput check ─────────────────────────────────────────────────────
178
+ check_throughput() {
179
+ if [[ ! -f "$LOG_FILE" ]]; then
180
+ return 0
181
+ fi
182
+
183
+ local last_line
184
+ last_line=$(grep -E 'step\s+[0-9]+.*tok/s' "$LOG_FILE" 2>/dev/null | tail -1)
185
+ [[ -z "$last_line" ]] && return 0
186
+
187
+ # tok/s may be formatted with commas: 36,321
188
+ local tokps step
189
+ tokps=$(echo "$last_line" | grep -oP 'tok/s\s+\K[\d,]+' | tr -d ',' || echo "")
190
+ step=$(echo "$last_line" | grep -oP 'step\s+\K[0-9]+' || echo "0")
191
+
192
+ if [[ -z "$tokps" ]]; then
193
+ log_msg "Could not parse tok/s from last log line."
194
+ return 0
195
+ fi
196
+
197
+ if (( step > 100 && tokps < MIN_TOKPS )); then
198
+ send_alert "THROUGHPUT" "Throughput dropped to <b>${tokps} tok/s</b> at step <code>$step</code> (min: ${MIN_TOKPS}).
199
+ GPU may be throttling, NCCL stalled, or a data worker is slow."
200
+ return 1
201
+ fi
202
+
203
+ log_msg "Throughput OK: ${tokps} tok/s at step $step."
204
+ return 0
205
+ }
206
+
207
+ # ─── 5. GPU utilization check ────────────────────────────────────────────────
208
+ check_gpu() {
209
+ if ! command -v nvidia-smi &>/dev/null; then
210
+ log_msg "nvidia-smi not available — skipping GPU check."
211
+ return 0
212
+ fi
213
+
214
+ local avg_util
215
+ avg_util=$(nvidia-smi --query-gpu=utilization.gpu --format=csv,noheader,nounits 2>/dev/null \
216
+ | awk '{sum+=$1; count++} END {if(count>0) printf "%.0f", sum/count; else print 0}')
217
+
218
+ if [[ -z "$avg_util" || "$avg_util" == "0" ]]; then
219
+ log_msg "GPU util query returned 0 or empty — possibly all idle."
220
+ # Only alert if process is also running
221
+ local pid
222
+ pid=$(cat "$PID_FILE" 2>/dev/null | tr -d '[:space:]')
223
+ if [[ -n "$pid" ]] && kill -0 "$pid" 2>/dev/null; then
224
+ send_alert "GPU_IDLE" "All 8× B200 GPUs show <b>0% utilization</b> while training process is alive.
225
+ Possible NCCL hang or data pipeline stall."
226
+ return 1
227
+ fi
228
+ return 0
229
+ fi
230
+
231
+ if (( avg_util < GPU_UTIL_WARN_PCT )); then
232
+ local gpu_details
233
+ gpu_details=$(nvidia-smi --query-gpu=index,utilization.gpu,memory.used,memory.total \
234
+ --format=csv,noheader 2>/dev/null | head -8 || echo "unavailable")
235
+ send_alert "GPU_LOW" "Average GPU utilization: <b>${avg_util}%</b> (threshold: ${GPU_UTIL_WARN_PCT}%).
236
+
237
+ GPU details:
238
+ <code>${gpu_details}</code>"
239
+ return 1
240
+ fi
241
+
242
+ log_msg "GPU utilization OK: ${avg_util}% average."
243
+ return 0
244
+ }
245
+
246
+ # ─── 6. Disk space check ─────────────────────────────────────────────────────
247
+ check_disk() {
248
+ local usage_pct
249
+ usage_pct=$(df "$CKPT_DIR" 2>/dev/null | awk 'NR==2 {gsub(/%/,"",$5); print $5}')
250
+
251
+ if [[ -z "$usage_pct" ]]; then
252
+ log_msg "Could not determine disk usage for $CKPT_DIR."
253
+ return 0
254
+ fi
255
+
256
+ if (( usage_pct >= DISK_WARN_PCT )); then
257
+ local avail
258
+ avail=$(df -h "$CKPT_DIR" 2>/dev/null | awk 'NR==2 {print $4}')
259
+ send_alert "DISK" "Disk usage at <b>${usage_pct}%</b> (threshold: ${DISK_WARN_PCT}%).
260
+ Available: <b>${avail}</b> on partition containing checkpoints.
261
+
262
+ Risk: checkpoint saves may fail. Consider deleting old checkpoints."
263
+ return 1
264
+ fi
265
+
266
+ log_msg "Disk usage OK: ${usage_pct}% used."
267
+ return 0
268
+ }
269
+
270
+ # ─── Main ────────────────────────────────────────────────────────────────────
271
+ main() {
272
+ log_msg "=== Watchdog check START ==="
273
+
274
+ local issues=0
275
+
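+ # "(( issues++ ))" exits non-zero while issues is still 0 (post-increment evaluates to 0), so the trailing "|| true" keeps set -e from aborting the run.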
276
+ check_process || (( issues++ )) || true
277
+ check_stall || (( issues++ )) || true
278
+ check_loss || (( issues++ )) || true
279
+ check_throughput || (( issues++ )) || true
280
+ check_gpu || (( issues++ )) || true
281
+ check_disk || (( issues++ )) || true
282
+
283
+ if (( issues == 0 )); then
284
+ log_msg "All checks passed — no alerts sent."
285
+ else
286
+ log_msg "Watchdog found $issues issue(s) — alerts sent."
287
+ fi
288
+
289
+ log_msg "=== Watchdog check END ==="
290
+ }
291
+
292
+ main "$@"
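For reference, a cron entry matching the 10-minute cadence mentioned in the header (the checkout path is illustrative; stdout goes to the watchdog.log path the script already defines):

```bash
*/10 * * * * /usr/bin/env bash /path/to/FRANKENSTALLM/scripts/training_watchdog.sh >> /path/to/FRANKENSTALLM/checkpoints/korean_3b_fp8_run1/watchdog.log 2>&1
```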
source/scripts/upload_to_huggingface.py ADDED
@@ -0,0 +1,182 @@
1
+ #!/usr/bin/env python3
2
+ """Upload FRANKENSTALLM: model, eval reports, source code, and data scripts to Hugging Face.
3
+
4
+ Usage:
5
+ huggingface-cli login
6
+
7
+ # 모델 + README + 평가 결과 + 보고서
8
+ python scripts/upload_to_huggingface.py --repo-id pathcosmos/frankenstallm --create-pr
9
+
10
+ # 위 + 소스 코드 + 데이터 스크립트 (모델/데이터/소스 전부)
11
+ python scripts/upload_to_huggingface.py --repo-id pathcosmos/frankenstallm --with-source --with-data --create-pr
12
+
13
+ # 평가·보고서만
14
+ python scripts/upload_to_huggingface.py --repo-id pathcosmos/frankenstallm --readme-only --create-pr
15
+ """
16
+
17
+ import argparse
18
+ from pathlib import Path
19
+
20
+ PROJECT_ROOT = Path(__file__).resolve().parent.parent
21
+ HF_CHECKPOINT = PROJECT_ROOT / "outputs" / "hf_checkpoint-best-fixed"
22
+ REPORTS_DIR = PROJECT_ROOT / "reports"
23
+ EVAL_RESULTS_DIR = PROJECT_ROOT / "eval" / "results" / "frankenstallm-3b-v2"
24
+ DATA_DIR = PROJECT_ROOT / "data"
25
+ SOURCE_DIRS = ["train", "model", "configs", "scripts", "tokenizer", "eval"]
26
+
27
+
28
+ def main():
29
+ parser = argparse.ArgumentParser(description="Upload model, eval reports, source, and data scripts to Hugging Face")
30
+ parser.add_argument("--repo-id", type=str, required=True, help="e.g. pathcosmos/frankenstallm")
31
+ parser.add_argument("--readme-only", action="store_true", help="Only push README + eval results + reports (no model)")
32
+ parser.add_argument("--create-pr", action="store_true", help="Create a Pull Request instead of pushing to main")
33
+ parser.add_argument("--with-source", action="store_true", help="Upload full source code (train, eval, model, configs, scripts, tokenizer)")
34
+ parser.add_argument("--with-data", action="store_true", help="Upload data scripts and DATA_README (no .bin files)")
35
+ args = parser.parse_args()
36
+ create_pr = getattr(args, "create_pr", False)
37
+
38
+ try:
39
+ from huggingface_hub import HfApi, create_repo
40
+ except ImportError:
41
+ print("Install: pip install huggingface_hub")
42
+ raise SystemExit(1)
43
+
44
+ api = HfApi()
45
+
46
+ # 레포 없으면 생성
47
+ # 레포가 없으면 생성 (본인 계정일 때만 성공)
48
+ try:
49
+ create_repo(args.repo_id, repo_type="model", exist_ok=True)
50
+ except Exception as e:
51
+ print(f"Note: create_repo skipped (use Hugging Face website to create repo if needed): {e}")
52
+
53
+ if not args.readme_only:
54
+ if not HF_CHECKPOINT.exists():
55
+ print(f"Checkpoint not found: {HF_CHECKPOINT}")
56
+ raise SystemExit(1)
57
+ print(f"Uploading model from {HF_CHECKPOINT} ...")
58
+ api.upload_folder(
59
+ folder_path=str(HF_CHECKPOINT),
60
+ repo_id=args.repo_id,
61
+ repo_type="model",
62
+ create_pr=create_pr,
63
+ )
64
+ print("Model upload done.")
65
+
66
+ # README는 체크포인트 폴더 것 사용 (이미 평가 요약 포함)
67
+ readme_src = HF_CHECKPOINT / "README.md"
68
+ if readme_src.exists():
69
+ print("Pushing README (model card) ...")
70
+ api.upload_file(
71
+ path_or_fileobj=str(readme_src),
72
+ path_in_repo="README.md",
73
+ repo_id=args.repo_id,
74
+ repo_type="model",
75
+ create_pr=create_pr,
76
+ )
77
+ print("README upload done.")
78
+ else:
79
+ print("No README.md in checkpoint dir; skipping README push.")
80
+
81
+ # 평가 결과 JSON
82
+ results_json = EVAL_RESULTS_DIR / "ollama_benchmark_results.json"
83
+ if results_json.exists():
84
+ print("Pushing ollama_benchmark_results.json ...")
85
+ api.upload_file(
86
+ path_or_fileobj=str(results_json),
87
+ path_in_repo="eval/ollama_benchmark_results.json",
88
+ repo_id=args.repo_id,
89
+ repo_type="model",
90
+ create_pr=create_pr,
91
+ )
92
+ print("Eval results upload done.")
93
+
94
+ # 배포·평가 보고서 (상세 기록)
95
+ for name, src in [
96
+ ("2026-03-09_GGUF_DEPLOYMENT_AND_EVAL_REPORT.md", REPORTS_DIR / "2026-03-09_GGUF_DEPLOYMENT_AND_EVAL_REPORT.md"),
97
+ ("2026-03-09_ORPO_EVALUATION_REPORT.md", REPORTS_DIR / "2026-03-09_ORPO_EVALUATION_REPORT.md"),
98
+ ]:
99
+ if src.exists():
100
+ print(f"Pushing {name} ...")
101
+ api.upload_file(
102
+ path_or_fileobj=str(src),
103
+ path_in_repo=f"eval_reports/{name}",
104
+ repo_id=args.repo_id,
105
+ repo_type="model",
106
+ create_pr=create_pr,
107
+ )
108
+ print("Reports upload done.")
109
+
110
+ # ---------- 소스 코드 (--with-source) ----------
111
+ if getattr(args, "with_source", False):
112
+ print("Uploading source code ...")
113
+ ignore_common = ["**/__pycache__/**", "**/*.pyc", "**/.DS_Store"]
114
+ for dirname in [d for d in SOURCE_DIRS if d != "eval"]:
115
+ src_dir = PROJECT_ROOT / dirname
116
+ if src_dir.exists():
117
+ api.upload_folder(
118
+ folder_path=str(src_dir),
119
+ path_in_repo=f"source/{dirname}",
120
+ repo_id=args.repo_id,
121
+ repo_type="model",
122
+ ignore_patterns=ignore_common,
123
+ create_pr=create_pr,
124
+ )
125
+ print(f" source/{dirname}/ done.")
126
+ # eval: outputs, results 제외 (대용량)
127
+ eval_dir = PROJECT_ROOT / "eval"
128
+ if eval_dir.exists():
129
+ api.upload_folder(
130
+ folder_path=str(eval_dir),
131
+ path_in_repo="source/eval",
132
+ repo_id=args.repo_id,
133
+ repo_type="model",
134
+ ignore_patterns=ignore_common + ["**/outputs/**", "**/results/**", "**/.compile_cache/**"],
135
+ create_pr=create_pr,
136
+ )
137
+ print(" source/eval/ done.")
138
+ # 루트 문서
139
+ for name in ["README.md", "CLAUDE.md", "requirements.txt", "PROGRESS.md"]:
140
+ src_file = PROJECT_ROOT / name
141
+ if src_file.exists():
142
+ api.upload_file(
143
+ path_or_fileobj=str(src_file),
144
+ path_in_repo=f"source/{name}",
145
+ repo_id=args.repo_id,
146
+ repo_type="model",
147
+ create_pr=create_pr,
148
+ )
149
+ for p in PROJECT_ROOT.glob("PLAN_*.md"):
150
+ api.upload_file(
151
+ path_or_fileobj=str(p),
152
+ path_in_repo=f"source/{p.name}",
153
+ repo_id=args.repo_id,
154
+ repo_type="model",
155
+ create_pr=create_pr,
156
+ )
157
+ print("Source upload done.")
158
+
159
+ # ---------- 데이터 스크립트 (--with-data, .bin 제외) ----------
160
+ if getattr(args, "with_data", False) and DATA_DIR.exists():
161
+ print("Uploading data scripts (no .bin) ...")
162
+ api.upload_folder(
163
+ folder_path=str(DATA_DIR),
164
+ path_in_repo="data",
165
+ repo_id=args.repo_id,
166
+ repo_type="model",
167
+ ignore_patterns=[
168
+ "**/*.bin",
169
+ "**/*.chunk*",
170
+ "**/__pycache__/**",
171
+ "**/code/**",
172
+ "**/*.pyc",
173
+ ],
174
+ create_pr=create_pr,
175
+ )
176
+ print("Data scripts upload done.")
177
+
178
+ print(f"Done. https://huggingface.co/{args.repo_id}")
179
+
180
+
181
+ if __name__ == "__main__":
182
+ main()
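A small post-upload sanity check, sketched with the public huggingface_hub API (the repo id matches the usage examples in the docstring above):

```python
from huggingface_hub import HfApi

api = HfApi()
files = api.list_repo_files("pathcosmos/frankenstallm", repo_type="model")
print(f"{len(files)} files in the repo")
print("\n".join(sorted(f for f in files if f.startswith("eval"))))
```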