Upload folder using huggingface_hub

#17
Files changed (39)
  1. source/scripts/RESTART_GUIDE.md +23 -0
  2. source/scripts/apply_optimizations.sh +194 -0
  3. source/scripts/build_3b_dataset.sh +83 -0
  4. source/scripts/check_korean_data.sh +178 -0
  5. source/scripts/clickhouse-watchdog.sh +201 -0
  6. source/scripts/convert_3b_gguf.sh +229 -0
  7. source/scripts/convert_to_gguf.sh +92 -0
  8. source/scripts/convert_to_hf.py +262 -0
  9. source/scripts/deploy_3b_ollama.sh +146 -0
  10. source/scripts/deploy_ollama.sh +118 -0
  11. source/scripts/fix_tokenizer_byte_fallback.py +235 -0
  12. source/scripts/hourly_status.sh +241 -0
  13. source/scripts/launch_3b_orpo.sh +177 -0
  14. source/scripts/launch_3b_pretrain.sh +258 -0
  15. source/scripts/launch_3b_sft.sh +145 -0
  16. source/scripts/launch_3b_sft_v2.sh +156 -0
  17. source/scripts/launch_fp8.sh +94 -0
  18. source/scripts/launch_hybrid_3b.sh +62 -0
  19. source/scripts/launch_korean_1b.sh +133 -0
  20. source/scripts/launch_korean_3b.sh +115 -0
  21. source/scripts/launch_sft.sh +111 -0
  22. source/scripts/migrate_qkv_checkpoint.py +230 -0
  23. source/scripts/monitor_3b.sh +316 -0
  24. source/scripts/monitor_training.sh +244 -0
  25. source/scripts/openclaw-watchdog.sh +243 -0
  26. source/scripts/orpo_eval_watchdog.sh +127 -0
  27. source/scripts/orpo_hp_sweep.sh +166 -0
  28. source/scripts/prepare_3b_data.sh +414 -0
  29. source/scripts/prepare_sft_combined.sh +264 -0
  30. source/scripts/quality_gate.sh +518 -0
  31. source/scripts/run_eval.sh +23 -0
  32. source/scripts/run_eval_full.sh +236 -0
  33. source/scripts/run_eval_quick.sh +150 -0
  34. source/scripts/run_pretrain.sh +26 -0
  35. source/scripts/start-gateway.sh +44 -0
  36. source/scripts/telegram_notify.py +168 -0
  37. source/scripts/test_ollama_repetition.py +148 -0
  38. source/scripts/training_watchdog.sh +292 -0
  39. source/scripts/upload_to_huggingface.py +182 -0
source/scripts/RESTART_GUIDE.md ADDED
@@ -0,0 +1,23 @@
1
+ # FRANKENSTALLM 3B — Optimization Restart Guide
2
+
3
+ ## Quick restart (all optimizations applied automatically):
4
+ ```bash
5
+ bash scripts/apply_optimizations.sh
6
+ ```
7
+
8
+ ## Validate only (no restart):
9
+ ```bash
10
+ bash scripts/apply_optimizations.sh --test-only
11
+ ```
12
+
13
+ ## Manual steps if auto-migration fails:
14
+ 1. Stop: `kill $(cat checkpoints/korean_3b_fp8_run1/train.pid)`
15
+ 2. Migrate: `python3 scripts/migrate_qkv_checkpoint.py checkpoints/korean_3b_fp8_run1/checkpoint-XXXXX`
16
+ 3. Restart: `bash scripts/launch_3b_pretrain.sh`
17
+
18
+ ## Rollback (undo QKV fusion):
19
+ ```bash
20
+ CKPT=checkpoints/korean_3b_fp8_run1/checkpoint-XXXXX
21
+ cp ${CKPT}/model.pt.backup_pre_qkv ${CKPT}/model.pt
22
+ git checkout model/attention.py # restore original attention code
23
+ ```
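
The `checkpoint-XXXXX` placeholder in the manual steps above has to be resolved by hand. A minimal sketch of looking it up automatically, mirroring the version-sorted lookup that `apply_optimizations.sh` performs (assumes the `checkpoint-<step>` directory naming used by this run):

```bash
# Resolve the newest checkpoint for the run and migrate it (QKV fusion).
CKPT_DIR=checkpoints/korean_3b_fp8_run1
LATEST_CKPT=$(ls -d "${CKPT_DIR}"/checkpoint-* 2>/dev/null | sort -V | tail -1)
echo "Latest checkpoint: ${LATEST_CKPT}"
python3 scripts/migrate_qkv_checkpoint.py "$LATEST_CKPT"
```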
source/scripts/apply_optimizations.sh ADDED
@@ -0,0 +1,194 @@
1
+ #!/usr/bin/env bash
2
+ # =============================================================================
3
+ # apply_optimizations.sh — Apply v2 optimizations and restart training
4
+ #
5
+ # Optimizations applied:
6
+ # 1. QKV Projection Fusion (+8-12% throughput)
7
+ # 2. NUMA CPU Affinity (fix 69% cross-NUMA workers)
8
+ # 3. Batch size 4→5 (11h saved over full run)
9
+ # 4. NCCL NVLS algorithm + 256MB buffers
10
+ # 5. DDP bucket_cap_mb 400→800
11
+ # 6. DataLoader num_workers 4→6, prefetch_factor 3→4
12
+ # 7. MADV_RANDOM + WILLNEED for PackedDataset
13
+ # 8. numactl --interleave=all on torchrun
14
+ #
15
+ # Usage:
16
+ # bash scripts/apply_optimizations.sh # full migration
17
+ # bash scripts/apply_optimizations.sh --test-only # just validate, don't restart
18
+ # bash scripts/apply_optimizations.sh --skip-stop # don't stop current training
19
+ # =============================================================================
20
+ set -u
21
+
22
+ cd "$(dirname "$0")/.."
23
+
24
+ RUN_NAME="korean_3b_fp8_run1"
25
+ CKPT_DIR="checkpoints/${RUN_NAME}"
26
+ PID_FILE="${CKPT_DIR}/train.pid"
27
+ LOG_FILE="${CKPT_DIR}/train.log"
28
+
29
+ TEST_ONLY=false
30
+ SKIP_STOP=false
31
+ for arg in "$@"; do
32
+ case "$arg" in
33
+ --test-only) TEST_ONLY=true ;;
34
+ --skip-stop) SKIP_STOP=true ;;
35
+ esac
36
+ done
37
+
38
+ echo "=================================================================="
39
+ echo " FRANKENSTALLM 3B — Optimization Migration v2"
40
+ echo " $(date)"
41
+ echo "=================================================================="
42
+
43
+ # ---- Step 1: Validate all modified files --------------------------------
44
+ echo ""
45
+ echo "[1/6] Validating modified files..."
46
+ ERRORS=0
47
+
48
+ for pyfile in model/attention.py train/pretrain.py data/dataset.py scripts/migrate_qkv_checkpoint.py; do
49
+ if python3 -c "import ast; ast.parse(open('$pyfile').read())" 2>/dev/null; then
50
+ echo " ✓ $pyfile — syntax OK"
51
+ else
52
+ echo " ✗ $pyfile — SYNTAX ERROR"
53
+ ((ERRORS++))
54
+ fi
55
+ done
56
+
57
+ if bash -n scripts/launch_3b_pretrain.sh 2>/dev/null; then
58
+ echo " ✓ scripts/launch_3b_pretrain.sh — syntax OK"
59
+ else
60
+ echo " ✗ scripts/launch_3b_pretrain.sh — SYNTAX ERROR"
61
+ ((ERRORS++))
62
+ fi
63
+
64
+ # Check YAML
65
+ python3 -c "
66
+ import yaml
67
+ with open('configs/korean_3b_fp8.yaml') as f:
68
+ cfg = yaml.safe_load(f)
69
+ assert cfg['train']['batch_size'] == 5, f'batch_size should be 5, got {cfg[\"train\"][\"batch_size\"]}'
70
+ print(' ✓ configs/korean_3b_fp8.yaml — valid, batch_size=5')
71
+ " 2>/dev/null || { echo " ✗ configs/korean_3b_fp8.yaml — INVALID"; ((ERRORS++)); }
72
+
73
+ if [[ $ERRORS -gt 0 ]]; then
74
+ echo ""
75
+ echo "[ERROR] $ERRORS file(s) failed validation. Aborting."
76
+ exit 1
77
+ fi
78
+ echo " All files validated successfully."
79
+
80
+ if $TEST_ONLY; then
81
+ echo ""
82
+ echo "[INFO] --test-only mode. Exiting without restart."
83
+ exit 0
84
+ fi
85
+
86
+ # ---- Step 2: Stop current training (graceful) ---------------------------
87
+ if ! $SKIP_STOP; then
88
+ echo ""
89
+ echo "[2/6] Stopping current training (SIGTERM → emergency checkpoint)..."
90
+ if [[ -f "$PID_FILE" ]]; then
91
+ PID=$(cat "$PID_FILE")
92
+ if kill -0 "$PID" 2>/dev/null; then
93
+ echo " Sending SIGTERM to PID $PID..."
94
+ kill "$PID"
95
+ echo " Waiting for graceful shutdown (up to 120s)..."
96
+ for i in $(seq 1 120); do
97
+ if ! kill -0 "$PID" 2>/dev/null; then
98
+ echo " Process stopped after ${i}s"
99
+ break
100
+ fi
101
+ sleep 1
102
+ done
103
+ if kill -0 "$PID" 2>/dev/null; then
104
+ echo " [WARN] Process still running after 120s. Force killing..."
105
+ kill -9 "$PID" 2>/dev/null || true
106
+ sleep 2
107
+ fi
108
+ else
109
+ echo " Process $PID not running."
110
+ fi
111
+ else
112
+ echo " No PID file found."
113
+ fi
114
+
115
+ # Wait for all GPU processes to clear
116
+ echo " Waiting for GPU processes to terminate..."
117
+ for i in $(seq 1 30); do
118
+ if ! pgrep -f "pretrain.py.*korean_3b" >/dev/null 2>&1; then
119
+ echo " All GPU processes cleared."
120
+ break
121
+ fi
122
+ sleep 1
123
+ done
124
+ fi
125
+
126
+ # ---- Step 3: Find and migrate latest checkpoint -------------------------
127
+ echo ""
128
+ echo "[3/6] Migrating latest checkpoint (QKV fusion)..."
129
+ LATEST_CKPT=$(ls -d "${CKPT_DIR}"/checkpoint-* 2>/dev/null | sort -V | tail -1)
130
+ if [[ -z "$LATEST_CKPT" ]]; then
131
+ echo " [ERROR] No checkpoint found!"
132
+ exit 1
133
+ fi
134
+ echo " Latest checkpoint: $LATEST_CKPT"
135
+
136
+ # Backup original model.pt
137
+ cp "${LATEST_CKPT}/model.pt" "${LATEST_CKPT}/model.pt.backup_pre_qkv"
138
+ echo " Backup created: model.pt.backup_pre_qkv"
139
+
140
+ # Run migration
141
+ python3 scripts/migrate_qkv_checkpoint.py "$LATEST_CKPT"
142
+ echo " QKV fusion migration complete."
143
+
144
+ # ---- Step 4: Quick validation test (5 steps) ----------------------------
145
+ echo ""
146
+ echo "[4/6] Running 5-step validation test..."
147
+ # Use single GPU for fast test
148
+ timeout 120 python3 train/pretrain.py \
149
+ --config configs/korean_3b_fp8.yaml \
150
+ --train_data data/3b_train.bin \
151
+ --checkpoint_dir /tmp/frankenstallm_test \
152
+ --max_steps 5 \
153
+ --batch_size 5 \
154
+ --resume "$LATEST_CKPT" \
155
+ 2>&1 | tail -10
156
+
157
+ TEST_EXIT=${PIPESTATUS[0]}   # use PIPESTATUS: plain $? would report tail's status, not the training run's
158
+ if [[ $TEST_EXIT -eq 0 ]]; then
159
+ echo " ✓ 5-step test passed!"
160
+ else
161
+ echo " ✗ 5-step test FAILED (exit code $TEST_EXIT)"
162
+ echo " [WARN] Restoring original checkpoint..."
163
+ cp "${LATEST_CKPT}/model.pt.backup_pre_qkv" "${LATEST_CKPT}/model.pt"
164
+ echo " Original checkpoint restored. Aborting."
165
+ exit 1
166
+ fi
167
+
168
+ # ---- Step 5: Clean up test artifacts ------------------------------------
169
+ echo ""
170
+ echo "[5/6] Cleaning up test artifacts..."
171
+ rm -rf /tmp/frankenstallm_test
172
+
173
+ # ---- Step 6: Launch full training with optimizations --------------------
174
+ echo ""
175
+ echo "[6/6] Launching optimized training..."
176
+ echo ""
177
+ echo " Changes applied:"
178
+ echo " • QKV Projection Fusion (single GEMM)"
179
+ echo " • NUMA CPU Affinity (cores 0-35→GPU0-3, 36-71→GPU4-7)"
180
+ echo " • Batch size: 4 → 5"
181
+ echo " • NCCL: NVLS,Ring algorithm, 256MB buffers"
182
+ echo " • DDP: bucket_cap_mb 400 → 800"
183
+ echo " • DataLoader: 4→6 workers, prefetch 3→4"
184
+ echo " • MADV_RANDOM + WILLNEED for dataset mmap"
185
+ echo " • numactl --interleave=all on torchrun"
186
+ echo ""
187
+
188
+ bash scripts/launch_3b_pretrain.sh
189
+
190
+ echo ""
191
+ echo "=================================================================="
192
+ echo " Migration complete! Monitor with:"
193
+ echo " tail -f ${LOG_FILE}"
194
+ echo "=================================================================="
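
The NCCL, NUMA, and DataLoader settings listed in the step 6 summary are applied by `scripts/launch_3b_pretrain.sh`, which is not shown in this diff. A hedged sketch of what the corresponding launch environment could look like; the exact values are assumptions taken only from the summary text above:

```bash
# Illustrative only: the authoritative settings live in scripts/launch_3b_pretrain.sh.
export NCCL_ALGO=NVLS,Ring                     # "NVLS,Ring algorithm" from the summary
export NCCL_BUFFSIZE=$((256 * 1024 * 1024))    # 256MB NCCL buffers
numactl --interleave=all torchrun --nproc_per_node=8 \
  train/pretrain.py --config configs/korean_3b_fp8.yaml
```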
source/scripts/build_3b_dataset.sh ADDED
@@ -0,0 +1,83 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ cd "$(dirname "$0")/.."
4
+ DATA="data"
5
+
6
+ echo "=================================================================="
7
+ echo " 3B 통합 데이터셋 빌드 | 시작: $(date)"
8
+ echo "=================================================================="
9
+
10
+ # 청크 병합 함수
11
+ merge_chunks() {
12
+ PREFIX="$1"
13
+ OUTPUT="$2"
14
+ CHUNKS=$(ls "${PREFIX}".bin.chunk* 2>/dev/null | sort || true)
15
+ if [[ -z "$CHUNKS" ]]; then return; fi
16
+ if [[ -f "$OUTPUT" ]]; then echo " [SKIP] $OUTPUT 이미 존재"; return; fi
17
+ echo " 청크 병합: $(basename $PREFIX)"
18
+ cat $CHUNKS > "$OUTPUT"
19
+ echo " 완료: $(du -sh $OUTPUT | cut -f1)"
20
+ }
21
+
22
+ merge_chunks "$DATA/cosmo_auto_math_text_train" "$DATA/cosmo_auto_math_text_train.bin"
23
+ merge_chunks "$DATA/cosmo_auto_math_text_val" "$DATA/cosmo_auto_math_text_val.bin"
24
+ merge_chunks "$DATA/cosmo_web_v2_train" "$DATA/cosmo_web_v2_train.bin"
25
+ merge_chunks "$DATA/cosmo_web_v2_val" "$DATA/cosmo_web_v2_val.bin"
26
+
27
+ TRAIN_FILES=""
28
+ for f in \
29
+ "$DATA/korean_train.bin" \
30
+ "$DATA/hplt_ko_train.bin" \
31
+ "$DATA/korean_c4_train.bin" \
32
+ "$DATA/cc100_ko_train.bin" \
33
+ "$DATA/namuwiki_2023b_train.bin" \
34
+ "$DATA/korean_namuwiki_train.bin" \
35
+ "$DATA/wikipedia_ko_train.bin" \
36
+ "$DATA/korean_wiki_train.bin" \
37
+ "$DATA/open_web_math_train.bin" \
38
+ "$DATA/mathpile_train.bin" \
39
+ "$DATA/cosmo_auto_math_text_train.bin" \
40
+ "$DATA/cosmo_stories_train.bin" \
41
+ "$DATA/cosmo_web_v2_train.bin" \
42
+ "$DATA/cosmo_stanford_train.bin" \
43
+ "$DATA/cosmo_wikihow_train.bin" \
44
+ "$DATA/cosmo_openstax_train.bin" \
45
+ "$DATA/cosmo_khanacademy_train.bin"; do
46
+ [[ -f "$f" ]] && TRAIN_FILES="$TRAIN_FILES $f"
47
+ done
48
+
49
+ VAL_FILES=""
50
+ for f in \
51
+ "$DATA/korean_val.bin" \
52
+ "$DATA/hplt_ko_val.bin" \
53
+ "$DATA/korean_c4_val.bin" \
54
+ "$DATA/cc100_ko_val.bin" \
55
+ "$DATA/namuwiki_2023b_val.bin" \
56
+ "$DATA/open_web_math_val.bin" \
57
+ "$DATA/mathpile_val.bin" \
58
+ "$DATA/cosmo_auto_math_text_val.bin" \
59
+ "$DATA/cosmo_stories_val.bin" \
60
+ "$DATA/cosmo_web_v2_val.bin"; do
61
+ [[ -f "$f" ]] && VAL_FILES="$VAL_FILES $f"
62
+ done
63
+
64
+ echo ""
65
+ echo "train 파일 병합 → data/3b_train.bin ..."
66
+ python3 data/merge_bins.py $TRAIN_FILES data/3b_train.bin
67
+
68
+ echo ""
69
+ echo "val 파일 병합 → data/3b_val.bin ..."
70
+ python3 data/merge_bins.py $VAL_FILES data/3b_val.bin
71
+
72
+ echo ""
73
+ echo "=================================================================="
74
+ du -sh data/3b_train.bin data/3b_val.bin
75
+ python3 -c "
76
+ import os
77
+ sz = os.path.getsize('data/3b_train.bin')
78
+ tok = sz // 2
79
+ print(f'3b_train: {tok/1e9:.2f}B tokens')
80
+ print(f'60B 달성 에포크: {60/(tok/1e9):.1f}x 반복 필요')
81
+ "
82
+ echo "완료: $(date)"
83
+ echo "=================================================================="
source/scripts/check_korean_data.sh ADDED
@@ -0,0 +1,178 @@
1
+ #!/bin/bash
2
+
3
+ # 한국어 학습 데이터 현황 확인 스크립트
4
+ # 용도: 한국어 데이터셋 상태, 토크나이저, 원본 데이터 파일 확인
5
+
6
+ set -e
7
+
8
+ # 프로젝트 루트 (이 스크립트 실행 위치 기준)
9
+ PROJECT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
10
+ cd "${PROJECT_ROOT}"
11
+
12
+ echo "=== 한국어 학습 데이터 현황 ==="
13
+ echo ""
14
+
15
+ # ============================================================================
16
+ # 1. 학습용 바이너리 데이터 확인
17
+ # ============================================================================
18
+ echo "[ 학습 바이너리 데이터 ]"
19
+
20
+ check_binary_data() {
21
+ local file=$1
22
+ local name=$2
23
+
24
+ if [ -f "$file" ]; then
25
+ local size=$(du -h "$file" | cut -f1)
26
+
27
+ # Python + numpy memmap으로 토큰 수 계산
28
+ # 바이너리는 uint32 형태로 저장되어 있음 (4 bytes per token)
29
+ local token_count=$(python3 -c "
30
+ import numpy as np
31
+ try:
32
+ data = np.memmap('$file', dtype=np.uint32, mode='r')
33
+ print(len(data))
34
+ except Exception as e:
35
+ print('error')
36
+ " 2>/dev/null || echo "error")
37
+
38
+ if [ "$token_count" != "error" ] && [ ! -z "$token_count" ]; then
39
+ # 토큰 수를 포맷팅 (천 단위 쉼표)
40
+ local formatted_tokens=$(printf "%'d" "$token_count")
41
+
42
+ # 1B 모델 학습 스텝 계산
43
+ # tokens_per_step = batch_size * grad_accum * seq_len * num_gpus
44
+ # = 8 * 4 * 4096 * 8 = 1,048,576 tokens/step
45
+ local tokens_per_step=1048576
46
+ local estimated_steps=$((token_count / tokens_per_step))
47
+
48
+ printf " %-20s : 존재 (%s, %'d 토큰, ~%'d steps)\n" \
49
+ "$name" "$size" "$token_count" "$estimated_steps"
50
+ else
51
+ printf " %-20s : 존재 (%s, 토큰 계산 실패)\n" "$name" "$size"
52
+ fi
53
+ else
54
+ printf " %-20s : 없음\n" "$name"
55
+ fi
56
+ }
57
+
58
+ check_binary_data "data/korean_train.bin" "korean_train.bin"
59
+ check_binary_data "data/korean_val.bin" "korean_val.bin"
60
+ check_binary_data "data/train.bin" "train.bin"
61
+ check_binary_data "data/val.bin" "val.bin"
62
+
63
+ echo ""
64
+
65
+ # ============================================================================
66
+ # 2. 토크나이저 확인
67
+ # ============================================================================
68
+ echo "[ 토크나이저 ]"
69
+
70
+ check_tokenizer() {
71
+ local dir=$1
72
+ local name=$2
73
+
74
+ if [ -d "$dir" ]; then
75
+ local files=$(find "$dir" -type f | wc -l)
76
+ printf " %-20s : 존재 (%d개 파일)\n" "$name" "$files"
77
+ else
78
+ printf " %-20s : 없음\n" "$name"
79
+ fi
80
+ }
81
+
82
+ check_tokenizer "tokenizer/korean_sp" "korean_sp"
83
+ check_tokenizer "tokenizer" "default tokenizer"
84
+
85
+ echo ""
86
+
87
+ # ============================================================================
88
+ # 3. 원본 데이터 디렉토리 확인
89
+ # ============================================================================
90
+ echo "[ 원본 데이터 ]"
91
+
92
+ check_raw_data() {
93
+ local dir=$1
94
+ local name=$2
95
+
96
+ if [ -d "$dir" ]; then
97
+ local file_count=$(find "$dir" -maxdepth 1 -type f | wc -l)
98
+ local total_size=$(du -sh "$dir" 2>/dev/null | cut -f1)
99
+
100
+ if [ $file_count -eq 0 ]; then
101
+ printf " %-20s : 없음 (디렉토리만 존재, 0 파일)\n" "$name"
102
+ else
103
+ printf " %-20s : %'d 파일 (%s)\n" "$name" "$file_count" "$total_size"
104
+ fi
105
+ else
106
+ printf " %-20s : 없음\n" "$name"
107
+ fi
108
+ }
109
+
110
+ check_raw_data "data/raw/cc100_ko" "cc100_ko/"
111
+ check_raw_data "data/raw/c4_ko" "c4_ko/"
112
+ check_raw_data "data/raw/namuwiki_ko" "namuwiki_ko/"
113
+
114
+ # 위키 데이터는 raw/ 직접 하위
115
+ echo ""
116
+ echo "[ 위키피디아 데이터 ]"
117
+ ko_wiki_count=$(find "data/raw" -maxdepth 1 -name "ko_wiki_*.txt" | wc -l)
118
+ en_wiki_count=$(find "data/raw" -maxdepth 1 -name "en_wiki_*.txt" | wc -l)
119
+ ko_wiki_size=$(du -sh "data/raw" 2>/dev/null | cut -f1)
120
+
121
+ if [ $ko_wiki_count -gt 0 ]; then
122
+ printf " %-20s : %'d 파일\n" "ko_wiki" "$ko_wiki_count"
123
+ fi
124
+
125
+ if [ $en_wiki_count -gt 0 ]; then
126
+ printf " %-20s : %'d 파일\n" "en_wiki" "$en_wiki_count"
127
+ fi
128
+
129
+ echo ""
130
+
131
+ # ============================================================================
132
+ # 4. 종합 상태 요약
133
+ # ============================================================================
134
+ echo "[ 종합 상태 ]"
135
+
136
+ # 학습용 바이너리 데이터 확인
137
+ binary_ready=false
138
+ if [ -f "data/korean_train.bin" ] && [ -f "data/korean_val.bin" ]; then
139
+ binary_ready=true
140
+ elif [ -f "data/train.bin" ] && [ -f "data/val.bin" ]; then
141
+ binary_ready=true
142
+ fi
143
+
144
+ # 토크나이저 확인
145
+ tokenizer_ready=false
146
+ if [ -d "tokenizer/korean_sp" ] && [ -f "tokenizer/korean_sp/tokenizer.model" ]; then
147
+ tokenizer_ready=true
148
+ fi
149
+
150
+ # 원본 데이터 확인
151
+ raw_ready=false
152
+ if [ -d "data/raw/c4_ko" ] || [ -d "data/raw/namuwiki_ko" ] || [ -d "data/raw/cc100_ko" ]; then
153
+ count=$(find "data/raw/c4_ko" -maxdepth 1 -type f 2>/dev/null | wc -l)
154
+ count=$((count + $(find "data/raw/namuwiki_ko" -maxdepth 1 -type f 2>/dev/null | wc -l)))
155
+ count=$((count + $(find "data/raw/cc100_ko" -maxdepth 1 -type f 2>/dev/null | wc -l)))
156
+ if [ $count -gt 0 ]; then
157
+ raw_ready=true
158
+ fi
159
+ fi
160
+
161
+ printf " 학습용 바이너리 : %s\n" "$([ "$binary_ready" = true ] && echo "✓ 준비됨" || echo "✗ 미준비")"
162
+ printf " 토크나이저 : %s\n" "$([ "$tokenizer_ready" = true ] && echo "✓ 준비됨" || echo "✗ 미준비")"
163
+ printf " 원본 데이터 : %s\n" "$([ "$raw_ready" = true ] && echo "✓ 준비됨" || echo "✗ 미준비")"
164
+
165
+ echo ""
166
+
167
+ # ============================================================================
168
+ # 5. 학습 설정 파라미터 정보
169
+ # ============================================================================
170
+ echo "[ 학습 설정 (1B 모델 기준) ]"
171
+ echo " 배치 사이즈 : 8"
172
+ echo " 시퀀스 길이 : 4096"
173
+ echo " GPU 수 : 8"
174
+ echo " 그래디언트 누적 : 4"
175
+ echo " 토큰/스텝 : 8 × 4 × 4096 × 8 = 1,048,576"
176
+ echo ""
177
+
178
+ echo "=== 검사 완료 ==="
source/scripts/clickhouse-watchdog.sh ADDED
@@ -0,0 +1,201 @@
1
+ #!/usr/bin/env bash
2
+ #
3
+ # clickhouse-watchdog.sh — ClickHouse 헬스체크 + 자동 재시작
4
+ # crontab에 등록하여 1분마다 실행
5
+ #
6
+ # Usage:
7
+ # */1 * * * * /PROJECT/0325120031_A/ghong/taketimes/llm-bang/scripts/clickhouse-watchdog.sh
8
+ #
9
+
10
+ set -euo pipefail
11
+
12
+ # ── 설정 ──────────────────────────────────────────────
13
+ CH_BIN="/PROJECT/0325120031_A/ghong/taketimes/clickhouse-bin"
14
+ CH_CONFIG="/PROJECT/0325120031_A/ghong/taketimes/llm-bang/configs/clickhouse-config.xml"
15
+ TCP_PORT=9000
16
+ HTTP_PORT=8123
17
+ HOST="127.0.0.1"
18
+
19
+ LOG_DIR="/tmp/clickhouse"
20
+ LOG_FILE="${LOG_DIR}/watchdog.log"
21
+ MAX_LOG_SIZE=$((10 * 1024 * 1024)) # 10MB 로테이션
22
+
23
+ RESTART_COOLDOWN=180 # 초 — 재시작 후 이 시간 내 재시도 방지
24
+ LAST_RESTART_FILE="/tmp/clickhouse-last-restart"
25
+ HEALTH_CHECK_TIMEOUT=5 # 초 — 헬스체크 curl/query 타임아웃
26
+
27
+ # ── 함수 ──────────────────────────────────────────────
28
+ mkdir -p "$LOG_DIR"
29
+
30
+ log() {
31
+ echo "[$(date '+%Y-%m-%d %H:%M:%S')] [clickhouse-watchdog] $*" >> "$LOG_FILE"
32
+ }
33
+
34
+ rotate_log() {
35
+ local file="$1"
36
+ if [[ -f "$file" ]] && [[ $(stat -c%s "$file" 2>/dev/null || echo 0) -gt $MAX_LOG_SIZE ]]; then
37
+ mv "$file" "${file}.old"
38
+ log "Log rotated: $file"
39
+ fi
40
+ }
41
+
42
+ is_tcp_port_open() {
43
+ if command -v ss &>/dev/null; then
44
+ ss -tlnH "sport = :${TCP_PORT}" 2>/dev/null | grep -q "$TCP_PORT"
45
+ else
46
+ (echo > /dev/tcp/"$HOST"/"$TCP_PORT") 2>/dev/null
47
+ fi
48
+ }
49
+
50
+ is_http_responding() {
51
+ # HTTP 인터페이스 핑 — ClickHouse는 GET / 에 "Ok.\n" 응답
52
+ if command -v curl &>/dev/null; then
53
+ local resp
54
+ resp=$(curl -s --max-time "$HEALTH_CHECK_TIMEOUT" "http://${HOST}:${HTTP_PORT}/ping" 2>/dev/null || true)
55
+ [[ "$resp" == "Ok." ]]
56
+ else
57
+ # curl 없으면 TCP 포트만 확인
58
+ (echo > /dev/tcp/"$HOST"/"$HTTP_PORT") 2>/dev/null
59
+ fi
60
+ }
61
+
62
+ is_process_alive() {
63
+ # ClickHouse 내부 watchdog 프로세스명: "clickhouse-watchdog" (바이너리 자체)
64
+ # 이 스크립트(clickhouse-watchdog.sh)와 구분하기 위해 --daemon 플래그 포함 패턴 사용
65
+ pgrep -f "clickhouse.*server.*--daemon" >/dev/null 2>&1
66
+ }
67
+
68
+ can_execute_query() {
69
+ # 실제 쿼리 실행으로 서버가 응답하는지 확인
70
+ local result
71
+ result=$("$CH_BIN" client --port "$TCP_PORT" --query "SELECT 1" 2>/dev/null || true)
72
+ [[ "$result" == "1" ]]
73
+ }
74
+
75
+ cooldown_active() {
76
+ if [[ -f "$LAST_RESTART_FILE" ]]; then
77
+ local last_restart now diff
78
+ last_restart=$(cat "$LAST_RESTART_FILE" 2>/dev/null)
79
+ now=$(date +%s)
80
+ diff=$(( now - last_restart ))
81
+ if [[ $diff -lt $RESTART_COOLDOWN ]]; then
82
+ return 0 # 쿨다운 중
83
+ fi
84
+ fi
85
+ return 1 # 쿨다운 아님
86
+ }
87
+
88
+ stop_existing() {
89
+ log "Stopping existing ClickHouse processes..."
90
+ local my_pid=$$
91
+ local pids
92
+
93
+ # 정상 종료 시도 (서버 프로세스)
94
+ pids=$(pgrep -f "clickhouse.*server.*--daemon" 2>/dev/null | grep -v "^${my_pid}$" || true)
95
+ if [[ -n "$pids" ]]; then
96
+ log "Sending TERM to PIDs: $pids"
97
+ echo "$pids" | xargs kill -TERM 2>/dev/null || true
98
+ sleep 3
99
+ # 아직 살아있으면 강제 종료
100
+ pids=$(pgrep -f "clickhouse.*server.*--daemon" 2>/dev/null | grep -v "^${my_pid}$" || true)
101
+ if [[ -n "$pids" ]]; then
102
+ log "Force killing PIDs: $pids"
103
+ echo "$pids" | xargs kill -9 2>/dev/null || true
104
+ sleep 2
105
+ fi
106
+ fi
107
+ }
108
+
109
+ start_server() {
110
+ log "Starting ClickHouse server (daemon mode)..."
111
+
112
+ # 기존 프로세스 정리
113
+ stop_existing
114
+
115
+ # 필요한 디렉토리 생성
116
+ mkdir -p /tmp/clickhouse/logs
117
+ mkdir -p /tmp/clickhouse-tmp
118
+
119
+ # 데몬 모드로 시작
120
+ "$CH_BIN" server --config-file="$CH_CONFIG" --daemon
121
+
122
+ # 시작 후 대기 + 확인 (최대 15초)
123
+ local attempts=0
124
+ local max_attempts=15
125
+ while [[ $attempts -lt $max_attempts ]]; do
126
+ sleep 1
127
+ attempts=$((attempts + 1))
128
+ if is_tcp_port_open && can_execute_query; then
129
+ date +%s > "$LAST_RESTART_FILE"
130
+ log "ClickHouse started successfully (took ${attempts}s)"
131
+ return 0
132
+ fi
133
+ done
134
+
135
+ date +%s > "$LAST_RESTART_FILE"
136
+ log "ERROR: ClickHouse did not respond within ${max_attempts}s after start"
137
+ return 1
138
+ }
139
+
140
+ # ── 메인 로직 ─────────────────────────────────────────
141
+ rotate_log "$LOG_FILE"
142
+
143
+ # 1) 바이너리 존재 확인
144
+ if [[ ! -x "$CH_BIN" ]]; then
145
+ log "FATAL: ClickHouse binary not found or not executable: $CH_BIN"
146
+ exit 1
147
+ fi
148
+
149
+ # 2) 프로세스 + 포트 + 쿼리 체크
150
+ process_ok=false
151
+ port_ok=false
152
+ query_ok=false
153
+
154
+ if is_process_alive; then
155
+ process_ok=true
156
+ fi
157
+
158
+ if is_tcp_port_open; then
159
+ port_ok=true
160
+ fi
161
+
162
+ if $port_ok && can_execute_query; then
163
+ query_ok=true
164
+ fi
165
+
166
+ # 3) 판단
167
+ if $process_ok && $port_ok && $query_ok; then
168
+ # 완전 정상 — 아무것도 안 함
169
+ exit 0
170
+ fi
171
+
172
+ # HTTP도 확인 (진단 로그용)
173
+ http_ok=false
174
+ if is_http_responding; then
175
+ http_ok=true
176
+ fi
177
+
178
+ # 비정상 상태 로깅
179
+ if $process_ok && $port_ok && ! $query_ok; then
180
+ log "WARN: Process alive, port open, but query failed. Possible hung state."
181
+ elif $process_ok && ! $port_ok; then
182
+ log "WARN: Process alive but TCP port $TCP_PORT not listening."
183
+ elif ! $process_ok; then
184
+ log "WARN: ClickHouse is completely down (no process found)."
185
+ fi
186
+ log "Status: process=$process_ok port=$port_ok query=$query_ok http=$http_ok"
187
+
188
+ # 4) 쿨다운 체크
189
+ if cooldown_active; then
190
+ log "Cooldown active (last restart < ${RESTART_COOLDOWN}s ago). Skipping."
191
+ exit 0
192
+ fi
193
+
194
+ # 5) 재시작
195
+ log "Attempting ClickHouse restart..."
196
+ if start_server; then
197
+ log "ClickHouse restart SUCCESS"
198
+ else
199
+ log "ClickHouse restart FAILED"
200
+ exit 1
201
+ fi
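
The Usage comment gives the crontab line; a hedged sketch of installing it non-interactively (the absolute path is taken from that comment):

```bash
# Register the watchdog to run every minute, replacing any previous entry for it.
SCRIPT=/PROJECT/0325120031_A/ghong/taketimes/llm-bang/scripts/clickhouse-watchdog.sh
( crontab -l 2>/dev/null | grep -vF "$SCRIPT"; echo "*/1 * * * * $SCRIPT" ) | crontab -
```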
source/scripts/convert_3b_gguf.sh ADDED
@@ -0,0 +1,229 @@
1
+ #!/usr/bin/env bash
2
+ # =============================================================================
3
+ # convert_3b_gguf.sh — 3B 모델 HuggingFace → GGUF 변환 + 다중 양자화
4
+ #
5
+ # Usage:
6
+ # bash scripts/convert_3b_gguf.sh [options]
7
+ #
8
+ # Options:
9
+ # --input_dir DIR HF 포맷 모델 디렉토리 (default: outputs/hf_korean_3b_orpo)
10
+ # --out_dir DIR GGUF 출력 디렉토리 (default: outputs/gguf)
11
+ # --checkpoint DIR 커스텀 체크포인트 디렉토리 (지정 시 HF 변환 선행 실행)
12
+ # --skip_hf_conv HF 변환 단계 건너뜀 (이미 HF 포맷 존재 시)
13
+ # --skip_quant 양자화 단계 건너뜀 (F16 GGUF만 생성)
14
+ #
15
+ # Pipeline:
16
+ # 1. [선택] 커스텀 체크포인트 → HF transformers 포맷 (convert_to_hf.py)
17
+ # 2. HF → F16 GGUF (llama.cpp/convert_hf_to_gguf.py)
18
+ # 3. F16 GGUF → Q4_K_M, Q5_K_M, Q8_0 양자화 (llama-quantize)
19
+ #
20
+ # Outputs:
21
+ # outputs/gguf/frankenstallm-3b-f16.gguf
22
+ # outputs/gguf/frankenstallm-3b-Q4_K_M.gguf — 권장 (Ollama용)
23
+ # outputs/gguf/frankenstallm-3b-Q5_K_M.gguf
24
+ # outputs/gguf/frankenstallm-3b-Q8_0.gguf
25
+ #
26
+ # 전제 조건:
27
+ # - python scripts/convert_to_hf.py 로 HF 변환 완료 (또는 --checkpoint 옵션)
28
+ # - git, cmake, make 설치
29
+ # - pip install safetensors
30
+ # =============================================================================
31
+ set -euo pipefail
32
+
33
+ # ---------------------------------------------------------------------------
34
+ # 인자 파싱
35
+ # ---------------------------------------------------------------------------
36
+ INPUT_DIR="outputs/hf_korean_3b_orpo"
37
+ OUT_DIR="outputs/gguf"
38
+ CHECKPOINT_DIR=""
39
+ SKIP_HF_CONV=false
40
+ SKIP_QUANT=false
41
+
42
+ while [[ $# -gt 0 ]]; do
43
+ case "$1" in
44
+ --input_dir) INPUT_DIR="$2"; shift 2 ;;
45
+ --out_dir) OUT_DIR="$2"; shift 2 ;;
46
+ --checkpoint) CHECKPOINT_DIR="$2"; shift 2 ;;
47
+ --skip_hf_conv) SKIP_HF_CONV=true; shift ;;
48
+ --skip_quant) SKIP_QUANT=true; shift ;;
49
+ -h|--help)
50
+ grep '^#' "$0" | head -40 | sed 's/^# \{0,1\}//'
51
+ exit 0 ;;
52
+ *)
53
+ echo "ERROR: 알 수 없는 옵션: $1"
54
+ echo "Usage: bash scripts/convert_3b_gguf.sh [--input_dir DIR] [--out_dir DIR] [--checkpoint DIR] [--skip_hf_conv] [--skip_quant]"
55
+ exit 1 ;;
56
+ esac
57
+ done
58
+
59
+ PROJECT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
60
+ LLAMA_CPP_DIR="${LLAMA_CPP_DIR:-$PROJECT_DIR/outputs/llama.cpp}"
61
+ MODEL_NAME="frankenstallm-3b"
62
+
63
+ cd "$PROJECT_DIR"
64
+
65
+ echo "=================================================================="
66
+ echo " 3B 모델 GGUF 변환 파이프라인"
67
+ echo " 입력 HF 디렉토리 : $INPUT_DIR"
68
+ echo " GGUF 출력 디렉토리: $OUT_DIR"
69
+ echo " llama.cpp 경로 : $LLAMA_CPP_DIR"
70
+ echo "=================================================================="
71
+ echo ""
72
+
73
+ # ---------------------------------------------------------------------------
74
+ # Step 0: llama.cpp 존재 여부 확인 / 클론
75
+ # ---------------------------------------------------------------------------
76
+ if [[ ! -d "$LLAMA_CPP_DIR" ]]; then
77
+ echo "[SETUP] llama.cpp 디렉토리가 없습니다."
78
+ echo " 다음 명령으로 설치하세요:"
79
+ echo ""
80
+ echo " git clone --depth 1 https://github.com/ggerganov/llama.cpp $LLAMA_CPP_DIR"
81
+ echo ""
82
+ echo " 또는 LLAMA_CPP_DIR 환경변수로 기존 경로를 지정하세요:"
83
+ echo " LLAMA_CPP_DIR=/path/to/llama.cpp bash scripts/convert_3b_gguf.sh"
84
+ echo ""
85
+ read -r -p "지금 자동 클론하시겠습니까? [y/N] " _yn
86
+ if [[ "${_yn:-N}" =~ ^[Yy]$ ]]; then
87
+ echo "Cloning llama.cpp ..."
88
+ git clone --depth 1 https://github.com/ggerganov/llama.cpp "$LLAMA_CPP_DIR"
89
+ else
90
+ echo "중단합니다. llama.cpp를 설치한 뒤 다시 실행하세요."
91
+ exit 1
92
+ fi
93
+ fi
94
+
95
+ # llama.cpp Python 의존성
96
+ echo "[SETUP] llama.cpp Python 의존성 설치 중 ..."
97
+ pip install -r "$LLAMA_CPP_DIR/requirements.txt" --break-system-packages -q
98
+
99
+ # ---------------------------------------------------------------------------
100
+ # Step 1: 커스텀 체크포인트 → HF 포맷 변환 (선택)
101
+ # ---------------------------------------------------------------------------
102
+ if [[ -n "$CHECKPOINT_DIR" && "$SKIP_HF_CONV" == "false" ]]; then
103
+ echo ""
104
+ echo "[STEP 1] 커스텀 체크포인트 → HF 포맷 변환"
105
+ echo " 체크포인트: $CHECKPOINT_DIR"
106
+ echo " 출력 : $INPUT_DIR"
107
+ echo ""
108
+
109
+ if [[ ! -d "$CHECKPOINT_DIR" ]]; then
110
+ echo "ERROR: 체크포인트 디렉토리를 찾을 수 없습니다: $CHECKPOINT_DIR"
111
+ exit 1
112
+ fi
113
+
114
+ python "$PROJECT_DIR/scripts/convert_to_hf.py" \
115
+ --checkpoint "$CHECKPOINT_DIR" \
116
+ --output "$INPUT_DIR" \
117
+ --tokenizer "tokenizer/korean_sp/tokenizer.json"
118
+
119
+ echo " [OK] HF 변환 완료 → $INPUT_DIR"
120
+ elif [[ "$SKIP_HF_CONV" == "true" ]]; then
121
+ echo "[STEP 1] HF 변환 건너뜀 (--skip_hf_conv)"
122
+ else
123
+ echo "[STEP 1] 체크포인트 미지정 — HF 디렉토리를 직접 사용합니다."
124
+ fi
125
+
126
+ # HF 디렉토리 최종 검증
127
+ if [[ ! -d "$INPUT_DIR" ]]; then
128
+ echo "ERROR: HF 모델 디렉토리를 찾을 수 없습니다: $INPUT_DIR"
129
+ echo " --checkpoint 옵션으로 체크포인트를 지정하거나,"
130
+ echo " python scripts/convert_to_hf.py 를 먼저 실행하세요."
131
+ exit 1
132
+ fi
133
+
134
+ if [[ ! -f "$INPUT_DIR/config.json" ]]; then
135
+ echo "ERROR: config.json 이 없습니다: $INPUT_DIR/config.json"
136
+ exit 1
137
+ fi
138
+
139
+ mkdir -p "$OUT_DIR"
140
+
141
+ # ---------------------------------------------------------------------------
142
+ # Step 2: llama.cpp 빌드 (llama-quantize 바이너리)
143
+ # ---------------------------------------------------------------------------
144
+ QUANTIZE_BIN="$LLAMA_CPP_DIR/build/bin/llama-quantize"
145
+
146
+ if [[ ! -f "$QUANTIZE_BIN" ]]; then
147
+ echo ""
148
+ echo "[STEP 2] llama.cpp 빌드 중 (llama-quantize) ..."
149
+ cmake -S "$LLAMA_CPP_DIR" -B "$LLAMA_CPP_DIR/build" \
150
+ -DCMAKE_BUILD_TYPE=Release \
151
+ -DGGML_CUDA=ON \
152
+ 2>&1 | tail -10
153
+ cmake --build "$LLAMA_CPP_DIR/build" --target llama-quantize -j "$(nproc)" \
154
+ 2>&1 | tail -10
155
+ echo " [OK] 빌드 완료: $QUANTIZE_BIN"
156
+ else
157
+ echo "[STEP 2] llama-quantize 바이너리 이미 존재 — 빌드 건너뜀"
158
+ fi
159
+
160
+ # ---------------------------------------------------------------------------
161
+ # Step 3: HF → F16 GGUF 변환
162
+ # ---------------------------------------------------------------------------
163
+ F16_GGUF="$OUT_DIR/${MODEL_NAME}-f16.gguf"
164
+
165
+ echo ""
166
+ echo "[STEP 3] HF → F16 GGUF 변환"
167
+ echo " 입력: $INPUT_DIR"
168
+ echo " 출력: $F16_GGUF"
169
+ echo ""
170
+
171
+ python "$LLAMA_CPP_DIR/convert_hf_to_gguf.py" "$INPUT_DIR" \
172
+ --outfile "$F16_GGUF" \
173
+ --outtype f16
174
+
175
+ echo " [OK] F16 GGUF 크기: $(du -sh "$F16_GGUF" | cut -f1) ($F16_GGUF)"
176
+
177
+ # ---------------------------------------------------------------------------
178
+ # Step 4: 다중 양자화 (Q4_K_M, Q5_K_M, Q8_0)
179
+ # ---------------------------------------------------------------------------
180
+ if [[ "$SKIP_QUANT" == "true" ]]; then
181
+ echo ""
182
+ echo "[STEP 4] 양자화 건너뜀 (--skip_quant)"
183
+ else
184
+ echo ""
185
+ echo "[STEP 4] 다중 양자화 시작 ..."
186
+
187
+ if [[ ! -f "$QUANTIZE_BIN" ]]; then
188
+ echo "[WARN] llama-quantize 바이너리를 찾을 수 없습니다: $QUANTIZE_BIN"
189
+ echo " 양자화를 건너뜁니다. F16 GGUF만 생성되었습니다."
190
+ echo " 수동 빌드: cmake --build $LLAMA_CPP_DIR/build --target llama-quantize"
191
+ else
192
+ # Q4_K_M — 가장 작은 크기, 품질/속도 균형 (Ollama 기본 권장)
193
+ Q4KM_GGUF="$OUT_DIR/${MODEL_NAME}-Q4_K_M.gguf"
194
+ echo " → Q4_K_M 양자화: $Q4KM_GGUF ..."
195
+ "$QUANTIZE_BIN" "$F16_GGUF" "$Q4KM_GGUF" Q4_K_M
196
+ echo " 크기: $(du -sh "$Q4KM_GGUF" | cut -f1)"
197
+
198
+ # Q5_K_M — 중간 크기, 더 높은 품질
199
+ Q5KM_GGUF="$OUT_DIR/${MODEL_NAME}-Q5_K_M.gguf"
200
+ echo " → Q5_K_M 양자화: $Q5KM_GGUF ..."
201
+ "$QUANTIZE_BIN" "$F16_GGUF" "$Q5KM_GGUF" Q5_K_M
202
+ echo " 크기: $(du -sh "$Q5KM_GGUF" | cut -f1)"
203
+
204
+ # Q8_0 — 가장 높은 품질 (F16 근사)
205
+ Q8_GGUF="$OUT_DIR/${MODEL_NAME}-Q8_0.gguf"
206
+ echo " → Q8_0 양자화: $Q8_GGUF ..."
207
+ "$QUANTIZE_BIN" "$F16_GGUF" "$Q8_GGUF" Q8_0
208
+ echo " 크기: $(du -sh "$Q8_GGUF" | cut -f1)"
209
+
210
+ echo ""
211
+ echo " [OK] 모든 양자화 완료"
212
+ fi
213
+ fi
214
+
215
+ # ---------------------------------------------------------------------------
216
+ # 완료 요약
217
+ # ---------------------------------------------------------------------------
218
+ echo ""
219
+ echo "=================================================================="
220
+ echo " 3B GGUF 변환 완료"
221
+ echo ""
222
+ echo " 출력 파일 목록:"
223
+ ls -lh "$OUT_DIR/${MODEL_NAME}"*.gguf 2>/dev/null | awk '{print " " $5 " " $9}' || \
224
+ echo " (파일 목록 확인: ls -lh $OUT_DIR/)"
225
+ echo ""
226
+ echo " 다음 단계:"
227
+ echo " bash scripts/deploy_3b_ollama.sh"
228
+ echo " bash scripts/quality_gate.sh deploy"
229
+ echo "=================================================================="
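
A hedged end-to-end invocation of the pipeline, assuming the ORPO checkpoint referenced by `deploy_ollama.sh` is still in the custom (non-HF) format so that the script runs `convert_to_hf.py` first, and that llama.cpp is already checked out at the given path:

```bash
# Raw checkpoint → HF → F16 GGUF → Q4_K_M / Q5_K_M / Q8_0 in one go.
LLAMA_CPP_DIR=/path/to/llama.cpp \
bash scripts/convert_3b_gguf.sh \
  --checkpoint checkpoints/korean_3b_orpo_v1/checkpoint-9840 \
  --out_dir outputs/gguf
```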
source/scripts/convert_to_gguf.sh ADDED
@@ -0,0 +1,92 @@
1
+ #!/usr/bin/env bash
2
+ # =============================================================================
3
+ # convert_to_gguf.sh — HuggingFace 포맷 모델을 GGUF로 변환 + Q4_K_M 양자화
4
+ #
5
+ # Usage:
6
+ # bash scripts/convert_to_gguf.sh [hf_dir] [out_dir]
7
+ #
8
+ # hf_dir : HF 포맷 모델 디렉토리 (default: outputs/hf)
9
+ # out_dir : GGUF 출력 디렉토리 (default: outputs/gguf)
10
+ #
11
+ # Outputs:
12
+ # outputs/gguf/korean-1b-f16.gguf — F16 GGUF
13
+ # outputs/gguf/korean-1b-q4km.gguf — Q4_K_M 양자화 (Ollama용)
14
+ #
15
+ # 전제 조건:
16
+ # - python scripts/convert_to_hf.py 로 HF 변환 완료
17
+ # - git, cmake, make 설치
18
+ # - pip install safetensors (없으면 pytorch_model.bin으로 fallback)
19
+ # =============================================================================
20
+ set -euo pipefail
21
+
22
+ HF_DIR="${1:-outputs/hf}"
23
+ OUT_DIR="${2:-outputs/gguf}"
24
+ LLAMA_CPP_DIR="outputs/llama.cpp"
25
+ PROJECT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
26
+
27
+ cd "$PROJECT_DIR"
28
+
29
+ # --- Pre-flight check -------------------------------------------------------
30
+ if [[ ! -d "$HF_DIR" ]]; then
31
+ echo "ERROR: HF model directory not found: $HF_DIR"
32
+ echo "Run first: python scripts/convert_to_hf.py --checkpoint <ckpt> --output $HF_DIR"
33
+ exit 1
34
+ fi
35
+
36
+ if [[ ! -f "$HF_DIR/config.json" ]]; then
37
+ echo "ERROR: config.json not found in $HF_DIR"
38
+ exit 1
39
+ fi
40
+
41
+ mkdir -p "$OUT_DIR"
42
+
43
+ # --- Clone llama.cpp if not present -----------------------------------------
44
+ if [[ ! -d "$LLAMA_CPP_DIR" ]]; then
45
+ echo "Cloning llama.cpp ..."
46
+ git clone --depth 1 https://github.com/ggerganov/llama.cpp "$LLAMA_CPP_DIR"
47
+ fi
48
+
49
+ # Install Python requirements for conversion script
50
+ echo "Installing llama.cpp Python deps ..."
51
+ pip install -r "$LLAMA_CPP_DIR/requirements.txt" --break-system-packages -q
52
+
53
+ # --- Build llama.cpp (for quantization binary) ------------------------------
54
+ QUANTIZE_BIN="$LLAMA_CPP_DIR/build/bin/llama-quantize"
55
+ if [[ ! -f "$QUANTIZE_BIN" ]]; then
56
+ echo "Building llama.cpp (quantization tool) ..."
57
+ cmake -S "$LLAMA_CPP_DIR" -B "$LLAMA_CPP_DIR/build" \
58
+ -DCMAKE_BUILD_TYPE=Release \
59
+ -DGGML_CUDA=ON \
60
+ 2>&1 | tail -5
61
+ cmake --build "$LLAMA_CPP_DIR/build" --target llama-quantize -j "$(nproc)" \
62
+ 2>&1 | tail -5
63
+ fi
64
+
65
+ # --- F16 GGUF conversion ---------------------------------------------------
66
+ F16_GGUF="$OUT_DIR/korean-1b-f16.gguf"
67
+ echo "Converting to F16 GGUF: $F16_GGUF ..."
68
+ python "$LLAMA_CPP_DIR/convert_hf_to_gguf.py" "$HF_DIR" \
69
+ --outfile "$F16_GGUF" \
70
+ --outtype f16
71
+
72
+ echo "F16 GGUF size: $(du -sh "$F16_GGUF" | cut -f1)"
73
+
74
+ # --- Q4_K_M quantization ---------------------------------------------------
75
+ Q4KM_GGUF="$OUT_DIR/korean-1b-q4km.gguf"
76
+ if [[ -f "$QUANTIZE_BIN" ]]; then
77
+ echo "Quantizing to Q4_K_M: $Q4KM_GGUF ..."
78
+ "$QUANTIZE_BIN" "$F16_GGUF" "$Q4KM_GGUF" Q4_K_M
79
+ echo "Q4_K_M GGUF size: $(du -sh "$Q4KM_GGUF" | cut -f1)"
80
+ else
81
+ echo "[WARN] llama-quantize binary not found. Using F16 GGUF for Ollama."
82
+ echo " Build: cmake --build $LLAMA_CPP_DIR/build --target llama-quantize"
83
+ cp "$F16_GGUF" "$Q4KM_GGUF"
84
+ fi
85
+
86
+ echo ""
87
+ echo "=================================================================="
88
+ echo " GGUF 변환 완료"
89
+ echo " F16 : $F16_GGUF"
90
+ echo " Q4KM: $Q4KM_GGUF"
91
+ echo " 다음 단계: bash scripts/deploy_ollama.sh"
92
+ echo "=================================================================="
source/scripts/convert_to_hf.py ADDED
@@ -0,0 +1,262 @@
1
+ """
2
+ Convert custom LLM checkpoint to HuggingFace LlamaForCausalLM format.
3
+
4
+ Usage:
5
+ python scripts/convert_to_hf.py \\
6
+ --checkpoint checkpoints/korean_1b_fp8_run1/checkpoint-0034000 \\
7
+ --output outputs/hf \\
8
+ [--tokenizer tokenizer/korean_sp/tokenizer.json]
9
+
10
+ Outputs (in --output directory):
11
+ config.json — LlamaConfig
12
+ model.safetensors — converted weights
13
+ tokenizer.json — tokenizer (copied)
14
+ tokenizer_config.json
15
+ generation_config.json
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ import argparse
21
+ import json
22
+ import shutil
23
+ import sys
24
+ from pathlib import Path
25
+
26
+ import torch
27
+
28
+ _PROJECT_ROOT = Path(__file__).resolve().parent.parent
29
+ if str(_PROJECT_ROOT) not in sys.path:
30
+ sys.path.insert(0, str(_PROJECT_ROOT))
31
+
32
+ from model.config import LMConfig
33
+
34
+
35
+ def remap_weights(
36
+ src_state_dict: dict,
37
+ config: LMConfig,
38
+ ) -> dict:
39
+ """
40
+ Remap custom LLM weight names to HuggingFace LlamaForCausalLM names.
41
+
42
+ Handles both FP8 (te.LayerNormMLP / te.Linear) and BF16 (SwiGLU / nn.Linear)
43
+ checkpoints transparently.
44
+ """
45
+ dst = {}
46
+ is_fp8 = config.use_fp8
47
+
48
+ # --- Token embedding ---
49
+ dst["model.embed_tokens.weight"] = src_state_dict["embedding.weight"].float()
50
+
51
+ for i in range(config.n_layers):
52
+ pfx = f"layers.{i}"
53
+ hpfx = f"model.layers.{i}"
54
+
55
+ # Attention norm (always RMSNorm)
56
+ dst[f"{hpfx}.input_layernorm.weight"] = (
57
+ src_state_dict[f"{pfx}.attn_norm.weight"].float()
58
+ )
59
+
60
+ # Attention projections
61
+ # Handle fused QKV (te.Linear with qkv_proj) vs separate q/k/v
62
+ qkv_key = f"{pfx}.attn.qkv_proj.weight"
63
+ if qkv_key in src_state_dict:
64
+ # Fused QKV: [Q_dim + K_dim + V_dim, d_model]
65
+ # GQA: Q = n_heads * head_dim, K = V = n_kv_heads * head_dim
66
+ qkv = src_state_dict[qkv_key].float()
67
+ head_dim = config.d_model // config.n_heads
68
+ q_dim = config.n_heads * head_dim # e.g. 24 * 128 = 3072
69
+ k_dim = config.n_kv_heads * head_dim # e.g. 8 * 128 = 1024
70
+ v_dim = config.n_kv_heads * head_dim # e.g. 8 * 128 = 1024
71
+ assert qkv.shape[0] == q_dim + k_dim + v_dim, (
72
+ f"QKV shape mismatch: {qkv.shape[0]} != {q_dim}+{k_dim}+{v_dim}"
73
+ )
74
+ dst[f"{hpfx}.self_attn.q_proj.weight"] = qkv[:q_dim]
75
+ dst[f"{hpfx}.self_attn.k_proj.weight"] = qkv[q_dim:q_dim + k_dim]
76
+ dst[f"{hpfx}.self_attn.v_proj.weight"] = qkv[q_dim + k_dim:]
77
+ else:
78
+ # Separate q/k/v projections
79
+ for src_name, dst_name in [
80
+ ("q_proj", "self_attn.q_proj"),
81
+ ("k_proj", "self_attn.k_proj"),
82
+ ("v_proj", "self_attn.v_proj"),
83
+ ]:
84
+ w_key = f"{pfx}.attn.{src_name}.weight"
85
+ if w_key in src_state_dict:
86
+ dst[f"{hpfx}.{dst_name}.weight"] = src_state_dict[w_key].float()
87
+
88
+ # Output projection
89
+ out_key = f"{pfx}.attn.out_proj.weight"
90
+ if out_key in src_state_dict:
91
+ dst[f"{hpfx}.self_attn.o_proj.weight"] = src_state_dict[out_key].float()
92
+
93
+ # FFN — FP8 (te.LayerNormMLP) vs BF16 (SwiGLU)
94
+ if is_fp8 and f"{pfx}.ffn.layer_norm_weight" in src_state_dict:
95
+ # te.LayerNormMLP: RMSNorm is fused inside
96
+ dst[f"{hpfx}.post_attention_layernorm.weight"] = (
97
+ src_state_dict[f"{pfx}.ffn.layer_norm_weight"].float()
98
+ )
99
+ # fc1_weight: [2*d_ffn, d_model] — gate and up are concatenated
100
+ fc1 = src_state_dict[f"{pfx}.ffn.fc1_weight"].float()
101
+ half = fc1.shape[0] // 2
102
+ dst[f"{hpfx}.mlp.gate_proj.weight"] = fc1[:half]
103
+ dst[f"{hpfx}.mlp.up_proj.weight"] = fc1[half:]
104
+ # fc2_weight: [d_model, d_ffn]
105
+ dst[f"{hpfx}.mlp.down_proj.weight"] = (
106
+ src_state_dict[f"{pfx}.ffn.fc2_weight"].float()
107
+ )
108
+ else:
109
+ # Standard SwiGLU (BF16 checkpoint)
110
+ dst[f"{hpfx}.post_attention_layernorm.weight"] = (
111
+ src_state_dict[f"{pfx}.ffn_norm.weight"].float()
112
+ )
113
+ dst[f"{hpfx}.mlp.gate_proj.weight"] = (
114
+ src_state_dict[f"{pfx}.ffn.gate_proj.weight"].float()
115
+ )
116
+ dst[f"{hpfx}.mlp.up_proj.weight"] = (
117
+ src_state_dict[f"{pfx}.ffn.up_proj.weight"].float()
118
+ )
119
+ dst[f"{hpfx}.mlp.down_proj.weight"] = (
120
+ src_state_dict[f"{pfx}.ffn.down_proj.weight"].float()
121
+ )
122
+
123
+ # --- Final norm and LM head ---
124
+ dst["model.norm.weight"] = src_state_dict["norm.weight"].float()
125
+ # Weight tying: embedding.weight == lm_head.weight in our model.
126
+ # HF LlamaForCausalLM expects lm_head.weight explicitly.
127
+ dst["lm_head.weight"] = src_state_dict["embedding.weight"].float().clone()
128
+
129
+ return dst
130
+
131
+
132
+ def build_llama_config(config: LMConfig) -> dict:
133
+ """Map LMConfig fields to HuggingFace LlamaConfig dict."""
134
+ return {
135
+ "architectures": ["LlamaForCausalLM"],
136
+ "model_type": "llama",
137
+ "hidden_size": config.d_model,
138
+ "intermediate_size": config.d_ffn,
139
+ "num_hidden_layers": config.n_layers,
140
+ "num_attention_heads": config.n_heads,
141
+ "num_key_value_heads": config.n_kv_heads,
142
+ "hidden_act": "silu",
143
+ "max_position_embeddings": config.max_seq_len,
144
+ "initializer_range": 0.02,
145
+ "rms_norm_eps": 1e-5,
146
+ "vocab_size": config.vocab_size,
147
+ "rope_theta": config.rope_theta,
148
+ "rope_scaling": None,
149
+ "attention_bias": config.bias,
150
+ "tie_word_embeddings": True,
151
+ "torch_dtype": "float16",
152
+ "transformers_version": "4.40.0",
153
+ }
154
+
155
+
156
+ def main() -> None:
157
+ parser = argparse.ArgumentParser(
158
+ description="Convert custom LLM checkpoint to HuggingFace LlamaForCausalLM format."
159
+ )
160
+ parser.add_argument(
161
+ "--checkpoint",
162
+ required=True,
163
+ type=Path,
164
+ help="Path to checkpoint directory (must contain model.pt + config.yaml).",
165
+ )
166
+ parser.add_argument(
167
+ "--output",
168
+ required=True,
169
+ type=Path,
170
+ help="Output directory for HF-format files.",
171
+ )
172
+ parser.add_argument(
173
+ "--tokenizer",
174
+ type=Path,
175
+ default=Path("tokenizer/korean_sp/tokenizer.json"),
176
+ help="Path to tokenizer.json (default: tokenizer/korean_sp/tokenizer.json).",
177
+ )
178
+ args = parser.parse_args()
179
+
180
+ ckpt_path = args.checkpoint
181
+ out_path = args.output
182
+
183
+ if not ckpt_path.exists():
184
+ raise FileNotFoundError(f"Checkpoint not found: {ckpt_path}")
185
+
186
+ out_path.mkdir(parents=True, exist_ok=True)
187
+ print(f"Checkpoint : {ckpt_path}")
188
+ print(f"Output : {out_path}")
189
+
190
+ # Load config
191
+ config = LMConfig.from_yaml(ckpt_path / "config.yaml")
192
+ print(f"Model : d_model={config.d_model}, n_layers={config.n_layers}, "
193
+ f"vocab_size={config.vocab_size}, use_fp8={config.use_fp8}")
194
+
195
+ # Load weights
196
+ print("Loading model.pt ...")
197
+ state_dict = torch.load(
198
+ ckpt_path / "model.pt",
199
+ map_location="cpu",
200
+ weights_only=True,
201
+ )
202
+ print(f" Source keys: {len(state_dict)}")
203
+
204
+ # Remap
205
+ print("Remapping weight names ...")
206
+ hf_state_dict = remap_weights(state_dict, config)
207
+ print(f" Destination keys: {len(hf_state_dict)}")
208
+
209
+ # Save safetensors
210
+ print("Saving model.safetensors ...")
211
+ try:
212
+ from safetensors.torch import save_file
213
+ save_file(hf_state_dict, out_path / "model.safetensors")
214
+ except ImportError:
215
+ print(" [WARN] safetensors not installed; falling back to pytorch_model.bin")
216
+ torch.save(hf_state_dict, out_path / "pytorch_model.bin")
217
+
218
+ # Save config.json
219
+ llama_cfg = build_llama_config(config)
220
+ with open(out_path / "config.json", "w", encoding="utf-8") as f:
221
+ json.dump(llama_cfg, f, indent=2, ensure_ascii=False)
222
+ print("Saved config.json")
223
+
224
+ # Save generation_config.json
225
+ gen_cfg = {
226
+ "bos_token_id": 1,
227
+ "eos_token_id": 2,
228
+ "pad_token_id": 0,
229
+ "max_new_tokens": 512,
230
+ "temperature": 0.8,
231
+ "top_p": 0.9,
232
+ "do_sample": True,
233
+ }
234
+ with open(out_path / "generation_config.json", "w", encoding="utf-8") as f:
235
+ json.dump(gen_cfg, f, indent=2, ensure_ascii=False)
236
+
237
+ # Copy tokenizer
238
+ tok_src = args.tokenizer
239
+ if tok_src.exists():
240
+ shutil.copy(tok_src, out_path / "tokenizer.json")
241
+ # Minimal tokenizer_config.json for HF compatibility
242
+ tok_cfg = {
243
+ "model_type": "llama",
244
+ "tokenizer_class": "PreTrainedTokenizerFast",
245
+ "bos_token": "<s>",
246
+ "eos_token": "</s>",
247
+ "unk_token": "<unk>",
248
+ "pad_token": "<pad>",
249
+ "clean_up_tokenization_spaces": False,
250
+ }
251
+ with open(out_path / "tokenizer_config.json", "w", encoding="utf-8") as f:
252
+ json.dump(tok_cfg, f, indent=2, ensure_ascii=False)
253
+ print(f"Copied tokenizer: {tok_src} -> {out_path / 'tokenizer.json'}")
254
+ else:
255
+ print(f"[WARN] Tokenizer not found at {tok_src}. Copy manually.")
256
+
257
+ print(f"\nDone! HF model saved to: {out_path}")
258
+ print("Verify: ls -lh", out_path)
259
+
260
+
261
+ if __name__ == "__main__":
262
+ main()
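
A minimal post-conversion sanity check, assuming the `transformers` package is installed and the output directory is `outputs/hf`; the classes used below are stock Hugging Face APIs, not part of this repo:

```bash
python3 - <<'EOF'
# Load the converted model with vanilla HF classes and run a short greedy generation.
from transformers import AutoModelForCausalLM, PreTrainedTokenizerFast

model = AutoModelForCausalLM.from_pretrained("outputs/hf", torch_dtype="auto")
tok = PreTrainedTokenizerFast(tokenizer_file="outputs/hf/tokenizer.json")
ids = tok("대한민국의 수도는", return_tensors="pt").input_ids
out = model.generate(ids, max_new_tokens=32, do_sample=False)
print(tok.decode(out[0]))
EOF
```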
source/scripts/deploy_3b_ollama.sh ADDED
@@ -0,0 +1,146 @@
1
+ #!/usr/bin/env bash
2
+ # =============================================================================
3
+ # deploy_3b_ollama.sh — 3B GGUF 모델을 Ollama에 등록 & 자동 테스트
4
+ #
5
+ # Usage:
6
+ # bash scripts/deploy_3b_ollama.sh [model_name]
7
+ #
8
+ # model_name: Ollama 모델 이름 (default: frankenstallm-3b)
9
+ #
10
+ # 전제 조건:
11
+ # - ollama 설치: https://ollama.com/download
12
+ # - bash scripts/convert_3b_gguf.sh 실행 완료
13
+ # - outputs/gguf/frankenstallm-3b-Q4_K_M.gguf 존재
14
+ # - Modelfile.3b 존재
15
+ # =============================================================================
16
+ set -euo pipefail
17
+
18
+ MODEL_NAME="${1:-frankenstallm-3b}"
19
+ PROJECT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
20
+ MODELFILE="$PROJECT_DIR/Modelfile.3b"
21
+ GGUF_PATH="$PROJECT_DIR/outputs/gguf/frankenstallm-3b-Q4_K_M.gguf"
22
+
23
+ cd "$PROJECT_DIR"
24
+
25
+ # ---------------------------------------------------------------------------
26
+ # Pre-flight check
27
+ # ---------------------------------------------------------------------------
28
+ if ! command -v ollama &> /dev/null; then
29
+ echo "ERROR: ollama가 설치되어 있지 않습니다."
30
+ echo "설치: curl -fsSL https://ollama.com/install.sh | sh"
31
+ exit 1
32
+ fi
33
+
34
+ if [[ ! -f "$GGUF_PATH" ]]; then
35
+ echo "ERROR: GGUF 파일을 찾을 수 없습니다: $GGUF_PATH"
36
+ echo "먼저 실행: bash scripts/convert_3b_gguf.sh"
37
+ exit 1
38
+ fi
39
+
40
+ if [[ ! -f "$MODELFILE" ]]; then
41
+ echo "ERROR: Modelfile.3b 를 찾을 수 없습니다: $MODELFILE"
42
+ echo " 프로젝트 루트에 Modelfile.3b 가 있어야 합니다."
43
+ exit 1
44
+ fi
45
+
46
+ echo "=================================================================="
47
+ echo " 3B 모델 Ollama 배포"
48
+ echo " 모델명 : $MODEL_NAME"
49
+ echo " GGUF : $(du -sh "$GGUF_PATH" | cut -f1) ($GGUF_PATH)"
50
+ echo " Modelfile: $MODELFILE"
51
+ echo "=================================================================="
52
+ echo ""
53
+
54
+ # ---------------------------------------------------------------------------
55
+ # Ollama 서버 실행 확인
56
+ # ---------------------------------------------------------------------------
57
+ if ! ollama list &>/dev/null; then
58
+ echo "[WARN] Ollama 서버가 응답하지 않습니다. 백그라운드로 시작합니다 ..."
59
+ ollama serve &>/tmp/ollama_serve.log &
60
+ OLLAMA_PID=$!
61
+ echo " PID: $OLLAMA_PID (로그: /tmp/ollama_serve.log)"
62
+ # 서버 준비 대기 (최대 15초)
63
+ for i in $(seq 1 15); do
64
+ if ollama list &>/dev/null; then
65
+ echo " [OK] Ollama 서버 준비 완료 (${i}초)"
66
+ break
67
+ fi
68
+ sleep 1
69
+ done
70
+ fi
71
+
72
+ # ---------------------------------------------------------------------------
73
+ # Ollama 모델 등록
74
+ # ---------------------------------------------------------------------------
75
+ echo "[1/2] Ollama 모델 등록 중: $MODEL_NAME ..."
76
+ ollama create "$MODEL_NAME" -f "$MODELFILE"
77
+ echo " [OK] 등록 완료"
78
+
79
+ # ---------------------------------------------------------------------------
80
+ # 자동 테스트 프롬프트 5개 실행
81
+ # ---------------------------------------------------------------------------
82
+ echo ""
83
+ echo "[2/2] 자동 테스트 프롬프트 실행 (5개) ..."
84
+ echo ""
85
+
86
+ declare -a TEST_PROMPTS=(
87
+ "안녕하세요! 간단히 자기소개를 해주세요."
88
+ "대한민국의 수도는 어디인가요? 그 도시의 특징을 설명해주세요."
89
+ "파이썬으로 피보나치 수열을 출력하는 함수를 작성해주세요."
90
+ "인공지능이 사회에 미치는 긍정적인 영향 3가지를 설명해주세요."
91
+ "오늘 저녁 메뉴로 무엇을 추천해주시겠어요? 이유도 함께 말씀해주세요."
92
+ )
93
+
94
+ PASS_COUNT=0
95
+ FAIL_COUNT=0
96
+ TOTAL=${#TEST_PROMPTS[@]}
97
+
98
+ for i in "${!TEST_PROMPTS[@]}"; do
99
+ PROMPT="${TEST_PROMPTS[$i]}"
100
+ NUM=$((i + 1))
101
+ echo "--- 테스트 $NUM/$TOTAL ---"
102
+ echo "프롬프트: $PROMPT"
103
+ echo ""
104
+
105
+ # ollama run: 타임아웃 60초, 응답 첫 300자만 표시
106
+ if RESPONSE=$(timeout 60 ollama run "$MODEL_NAME" "$PROMPT" 2>&1); then
107
+ RESP_PREVIEW="${RESPONSE:0:300}"
108
+ echo "응답: $RESP_PREVIEW"
109
+ if [[ ${#RESPONSE} -gt 300 ]]; then
110
+ echo " ... (총 ${#RESPONSE}자)"
111
+ fi
112
+ echo "[OK] 테스트 $NUM 성공"
113
+ PASS_COUNT=$((PASS_COUNT + 1))
114
+ else
115
+ EXIT_CODE=$?
116
+ echo "[FAIL] 테스트 $NUM 실패 (exit code: $EXIT_CODE)"
117
+ FAIL_COUNT=$((FAIL_COUNT + 1))
118
+ fi
119
+ echo ""
120
+ done
121
+
122
+ # ---------------------------------------------------------------------------
123
+ # 결과 요약
124
+ # ---------------------------------------------------------------------------
125
+ echo "=================================================================="
126
+ echo " 배포 & 테스트 완료"
127
+ echo ""
128
+ echo " 모델명 : $MODEL_NAME"
129
+ echo " 테스트 : $PASS_COUNT/$TOTAL 성공 ($FAIL_COUNT 실패)"
130
+ echo ""
131
+ if [[ $FAIL_COUNT -eq 0 ]]; then
132
+ echo " [PASS] 모든 테스트 통과"
133
+ else
134
+ echo " [WARN] 일부 테스트 실패 — 로그를 확인하세요"
135
+ fi
136
+ echo ""
137
+ echo " Ollama 사용법:"
138
+ echo " ollama run $MODEL_NAME"
139
+ echo " ollama run $MODEL_NAME '질문을 여기에 입력하세요'"
140
+ echo " ollama rm $MODEL_NAME (삭제)"
141
+ echo ""
142
+ echo " Quality Gate:"
143
+ echo " bash scripts/quality_gate.sh deploy"
144
+ echo "=================================================================="
145
+
146
+ [[ $FAIL_COUNT -gt 0 ]] && exit 1 || exit 0
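
Besides `ollama run`, the registered model can also be exercised through Ollama's local HTTP API (port 11434 is Ollama's default and is not configured by this script):

```bash
# Single non-streaming completion against the locally registered model.
curl -s http://localhost:11434/api/generate \
  -d '{"model": "frankenstallm-3b", "prompt": "대한민국의 수도는?", "stream": false}'
```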
source/scripts/deploy_ollama.sh ADDED
@@ -0,0 +1,118 @@
1
+ #!/usr/bin/env bash
2
+ # =============================================================================
3
+ # deploy_ollama.sh — FRANKENSTALLM 3B GGUF → Ollama 원클릭 배포
4
+ #
5
+ # Usage:
6
+ # bash scripts/deploy_ollama.sh # 기본 (Q4_K_M)
7
+ # bash scripts/deploy_ollama.sh --quant Q8_0 # Q8_0 양자화
8
+ # bash scripts/deploy_ollama.sh --skip_convert # GGUF 이미 존재 시
9
+ #
10
+ # Pipeline:
11
+ # 1. [선택] GGUF 변환 + 양자화 (convert_3b_gguf.sh)
12
+ # 2. Ollama 설치 확인 / 서버 시작
13
+ # 3. Modelfile.3b로 모델 등록
14
+ # 4. 자동 테스트 (5개 프롬프트)
15
+ # 5. 반복률 검증 (15개 프롬프트)
16
+ # =============================================================================
17
+ set -euo pipefail
18
+
19
+ QUANT="${QUANT:-Q4_K_M}"
20
+ MODEL_NAME="frankenstallm-3b"
21
+ SKIP_CONVERT=false
22
+
23
+ while [[ $# -gt 0 ]]; do
24
+ case "$1" in
25
+ --quant) QUANT="$2"; shift 2 ;;
26
+ --skip_convert) SKIP_CONVERT=true; shift ;;
27
+ -h|--help)
28
+ grep '^#' "$0" | head -20 | sed 's/^# \{0,1\}//'
29
+ exit 0 ;;
30
+ *) echo "ERROR: 알 수 없는 옵션: $1"; exit 1 ;;
31
+ esac
32
+ done
33
+
34
+ PROJECT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
35
+ cd "$PROJECT_DIR"
36
+
37
+ GGUF_PATH="outputs/gguf/frankenstallm-3b-${QUANT}.gguf"
38
+ MODELFILE="Modelfile.3b"
39
+
40
+ echo "=================================================================="
41
+ echo " FRANKENSTALLM 3B Ollama 배포"
42
+ echo " 양자화 : $QUANT"
43
+ echo " GGUF : $GGUF_PATH"
44
+ echo " Modelfile: $MODELFILE"
45
+ echo "=================================================================="
46
+
47
+ # ---- Step 1: GGUF 변환 (필요 시) ----
48
+ if [[ "$SKIP_CONVERT" == "false" ]]; then
49
+ if [[ ! -f "$GGUF_PATH" ]]; then
50
+ echo ""
51
+ echo "[Step 1] GGUF 변환 실행 중 ..."
52
+ bash scripts/convert_3b_gguf.sh \
53
+ --input_dir checkpoints/korean_3b_orpo_v1/checkpoint-9840
54
+ else
55
+ echo "[Step 1] GGUF 파일 이미 존재 — 변환 건너뜀"
56
+ fi
57
+ else
58
+ echo "[Step 1] 변환 건너뜀 (--skip_convert)"
59
+ fi
60
+
61
+ if [[ ! -f "$GGUF_PATH" ]]; then
62
+ echo "ERROR: GGUF 파일 없음: $GGUF_PATH"
63
+ exit 1
64
+ fi
65
+
66
+ echo " GGUF 크기: $(du -sh "$GGUF_PATH" | cut -f1)"
67
+
68
+ # ---- Step 2: Ollama 설치 확인 ----
69
+ if ! command -v ollama &>/dev/null; then
70
+ echo ""
71
+ echo "[Step 2] Ollama 미설치 — 설치 중 ..."
72
+ curl -fsSL https://ollama.com/install.sh | sh
73
+ fi
74
+
75
+ # Ollama 서버 시작
76
+ if ! ollama list &>/dev/null 2>&1; then
77
+ echo "[Step 2] Ollama 서버 시작 중 ..."
78
+ ollama serve &>/tmp/ollama_serve.log &
79
+ for i in $(seq 1 15); do
80
+ if ollama list &>/dev/null 2>&1; then
81
+ echo " [OK] Ollama 서버 준비 (${i}초)"
82
+ break
83
+ fi
84
+ sleep 1
85
+ done
86
+ fi
87
+
88
+ # ---- Step 3: 모델 등록 ----
89
+ echo ""
90
+ echo "[Step 3] Ollama 모델 등록: $MODEL_NAME"
91
+ ollama create "$MODEL_NAME" -f "$MODELFILE"
92
+ echo " [OK] 등록 완료"
93
+
94
+ # ---- Step 4: 자동 테스트 ----
95
+ echo ""
96
+ echo "[Step 4] 자동 테스트 ..."
97
+ declare -a QUICK_TESTS=(
98
+ "대한민국의 수도는?"
99
+ "인공지능이란 무엇인가요?"
100
+ "한국의 전통 음식 중에서 김치에 대해 설명해주세요."
101
+ )
102
+
103
+ for prompt in "${QUICK_TESTS[@]}"; do
104
+ echo " Q: $prompt"
105
+ RESP=$(timeout 60 ollama run "$MODEL_NAME" "$prompt" 2>&1 || echo "[TIMEOUT/ERROR]")
106
+ echo " A: ${RESP:0:200}"
107
+ echo ""
108
+ done
109
+
110
+ # ---- Step 5: 반복률 검증 ----
111
+ echo "[Step 5] 반복률 검증 (15개 프롬프트) ..."
112
+ python3 scripts/test_ollama_repetition.py --model "$MODEL_NAME"
113
+
114
+ echo ""
115
+ echo "=================================================================="
116
+ echo " 배포 완료!"
117
+ echo " 사용법: ollama run $MODEL_NAME"
118
+ echo "=================================================================="
source/scripts/fix_tokenizer_byte_fallback.py ADDED
@@ -0,0 +1,235 @@
1
+ #!/usr/bin/env python3
2
+ """Fix GGUF newline crash by adding byte-fallback tokens to the tokenizer.
3
+
4
+ Problem: The SentencePiece Unigram tokenizer was trained without byte_fallback=True,
5
+ so characters like \n have no token representation. llama.cpp crashes when it
6
+ encounters these characters because there's no byte-fallback.
7
+
8
+ Fix:
9
+ 1. Add 256 byte-fallback tokens (<0x00> .. <0xFF>) to tokenizer.json
10
+ 2. Resize model embeddings from 64000 -> 64256
11
+ 3. Update config.json vocab_size
12
+ 4. Copy tokenizer.model for proper GGUF conversion
13
+
14
+ Usage:
15
+ python scripts/fix_tokenizer_byte_fallback.py \
16
+ --input outputs/hf_checkpoint-best \
17
+ --output outputs/hf_checkpoint-best-fixed \
18
+ --sp_model tokenizer/korean_sp/tokenizer.model
19
+ """
20
+
21
+ import argparse
22
+ import json
23
+ import shutil
24
+ from pathlib import Path
25
+
26
+ import torch
27
+ from safetensors.torch import load_file, save_file
28
+
29
+
30
+ BYTE_FALLBACK_COUNT = 256
31
+ BYTE_TOKEN_TEMPLATE = "<0x{:02X}>"
32
+
33
+
34
+ def fix_tokenizer_json(input_path: Path, output_path: Path):
35
+ """Add byte_fallback=True and 256 byte tokens to tokenizer.json."""
36
+ with open(input_path) as f:
37
+ tok = json.load(f)
38
+
39
+ model = tok["model"]
40
+ vocab = model["vocab"] # list of [piece, score]
41
+ original_size = len(vocab)
42
+
43
+ # Enable byte_fallback
44
+ model["byte_fallback"] = True
45
+
46
+ # Add 256 byte tokens with very low score (they're fallback only)
47
+ for i in range(BYTE_FALLBACK_COUNT):
48
+ byte_token = BYTE_TOKEN_TEMPLATE.format(i)
49
+ vocab.append([byte_token, 0.0])
50
+
51
+ new_size = len(vocab)
52
+ print(f" Vocab: {original_size} -> {new_size} (+{BYTE_FALLBACK_COUNT} byte tokens)")
53
+ print(f" byte_fallback: False -> True")
54
+
55
+ # Also add byte tokens to added_tokens list
56
+ added = tok.get("added_tokens", [])
57
+ for i in range(BYTE_FALLBACK_COUNT):
58
+ byte_token = BYTE_TOKEN_TEMPLATE.format(i)
59
+ added.append({
60
+ "id": original_size + i,
61
+ "content": byte_token,
62
+ "single_word": False,
63
+ "lstrip": False,
64
+ "rstrip": False,
65
+ "normalized": False,
66
+ "special": True,
67
+ })
68
+ tok["added_tokens"] = added
69
+
70
+ with open(output_path, "w") as f:
71
+ json.dump(tok, f, ensure_ascii=False, indent=2)
72
+
73
+ return original_size, new_size
74
+
75
+
76
+ def fix_config_json(input_path: Path, output_path: Path, new_vocab_size: int):
77
+ """Update vocab_size in config.json."""
78
+ with open(input_path) as f:
79
+ config = json.load(f)
80
+
81
+ old_size = config["vocab_size"]
82
+ config["vocab_size"] = new_vocab_size
83
+ print(f" config.json vocab_size: {old_size} -> {new_vocab_size}")
84
+
85
+ with open(output_path, "w") as f:
86
+ json.dump(config, f, indent=2)
87
+
88
+
89
+ def resize_embeddings(input_path: Path, output_path: Path,
90
+ old_vocab: int, new_vocab: int, tie_embeddings: bool):
91
+ """Resize embedding and lm_head weights to accommodate new tokens."""
92
+ print(f" Loading model weights from {input_path} ...")
93
+ state_dict = load_file(str(input_path))
94
+
95
+ embed_key = "model.embed_tokens.weight"
96
+ lm_head_key = "lm_head.weight"
97
+
98
+ if embed_key not in state_dict:
99
+ raise KeyError(f"{embed_key} not found in state_dict. Keys: {list(state_dict.keys())[:10]}")
100
+
101
+ embed = state_dict[embed_key]
102
+ print(f" embed_tokens shape: {embed.shape}")
103
+
104
+ hidden_size = embed.shape[1]
105
+ extra = new_vocab - old_vocab
106
+
107
+ # Initialize new embeddings as mean of existing (better than random for byte tokens)
108
+ mean_embed = embed.mean(dim=0, keepdim=True)
109
+ # Add small noise to avoid identical embeddings
110
+ noise = torch.randn(extra, hidden_size, dtype=embed.dtype) * 0.01
111
+ new_rows = mean_embed.expand(extra, -1) + noise
112
+
113
+ new_embed = torch.cat([embed, new_rows], dim=0)
114
+ state_dict[embed_key] = new_embed
115
+ print(f" embed_tokens resized: {embed.shape} -> {new_embed.shape}")
116
+
117
+ if tie_embeddings:
118
+ # When tie_word_embeddings=True, lm_head shares embed_tokens
119
+ # Remove lm_head if present (it will be tied automatically)
120
+ if lm_head_key in state_dict:
121
+ del state_dict[lm_head_key]
122
+ print(f" lm_head removed (tie_word_embeddings=True)")
123
+ else:
124
+ if lm_head_key in state_dict:
125
+ lm_head = state_dict[lm_head_key]
126
+ mean_lm = lm_head.mean(dim=0, keepdim=True)
127
+ noise_lm = torch.randn(extra, hidden_size, dtype=lm_head.dtype) * 0.01
128
+ new_lm = torch.cat([lm_head, mean_lm.expand(extra, -1) + noise_lm], dim=0)
129
+ state_dict[lm_head_key] = new_lm
130
+ print(f" lm_head resized: {lm_head.shape} -> {new_lm.shape}")
131
+
132
+ print(f" Saving to {output_path} ...")
133
+ save_file(state_dict, str(output_path))
134
+
135
+
136
+ def main():
137
+ parser = argparse.ArgumentParser(description="Fix tokenizer byte-fallback for GGUF")
138
+ parser.add_argument("--input", type=Path, required=True, help="Input HF checkpoint dir")
139
+ parser.add_argument("--output", type=Path, required=True, help="Output fixed HF checkpoint dir")
140
+ parser.add_argument("--sp_model", type=Path, default=None,
141
+ help="SentencePiece .model file to copy (for GGUF conversion)")
142
+ args = parser.parse_args()
143
+
144
+ input_dir = args.input
145
+ output_dir = args.output
146
+
147
+ if not input_dir.exists():
148
+ print(f"ERROR: Input directory not found: {input_dir}")
149
+ return 1
150
+
151
+ output_dir.mkdir(parents=True, exist_ok=True)
152
+
153
+ # Load config to check tie_word_embeddings
154
+ with open(input_dir / "config.json") as f:
155
+ config = json.load(f)
156
+ old_vocab = config["vocab_size"]
157
+ new_vocab = old_vocab + BYTE_FALLBACK_COUNT
158
+ tie_embeddings = config.get("tie_word_embeddings", False)
159
+
160
+ print(f"=== Byte-Fallback Fix ===")
161
+ print(f"Input: {input_dir}")
162
+ print(f"Output: {output_dir}")
163
+ print(f"Old vocab: {old_vocab}, New vocab: {new_vocab}")
164
+ print(f"tie_word_embeddings: {tie_embeddings}")
165
+ print()
166
+
167
+ # 1. Fix tokenizer.json
168
+ print("[1/4] Fixing tokenizer.json ...")
169
+ fix_tokenizer_json(
170
+ input_dir / "tokenizer.json",
171
+ output_dir / "tokenizer.json",
172
+ )
173
+
174
+ # 2. Fix config.json
175
+ print("[2/4] Fixing config.json ...")
176
+ fix_config_json(
177
+ input_dir / "config.json",
178
+ output_dir / "config.json",
179
+ new_vocab,
180
+ )
181
+
182
+ # 3. Resize model weights
183
+ print("[3/4] Resizing embeddings ...")
184
+ resize_embeddings(
185
+ input_dir / "model.safetensors",
186
+ output_dir / "model.safetensors",
187
+ old_vocab, new_vocab, tie_embeddings,
188
+ )
189
+
190
+ # 4. Copy other files
191
+ print("[4/4] Copying remaining files ...")
192
+ for fname in ["tokenizer_config.json", "generation_config.json"]:
193
+ src = input_dir / fname
194
+ if src.exists():
195
+ shutil.copy2(src, output_dir / fname)
196
+ print(f" Copied {fname}")
197
+
198
+ # Copy SentencePiece model if provided (needed for GGUF conversion)
199
+ if args.sp_model and args.sp_model.exists():
200
+ shutil.copy2(args.sp_model, output_dir / "tokenizer.model")
201
+ print(f" Copied tokenizer.model from {args.sp_model}")
202
+ elif (input_dir / "tokenizer.model").exists():
203
+ shutil.copy2(input_dir / "tokenizer.model", output_dir / "tokenizer.model")
204
+ print(f" Copied tokenizer.model from input dir")
205
+
206
+ # Update tokenizer_config.json to add added_tokens_decoder for byte tokens
207
+ tc_path = output_dir / "tokenizer_config.json"
208
+ if tc_path.exists():
209
+ with open(tc_path) as f:
210
+ tc = json.load(f)
211
+ added_tokens_decoder = tc.get("added_tokens_decoder", {})
212
+ for i in range(BYTE_FALLBACK_COUNT):
213
+ token_id = old_vocab + i
214
+ byte_token = BYTE_TOKEN_TEMPLATE.format(i)
215
+ added_tokens_decoder[str(token_id)] = {
216
+ "content": byte_token,
217
+ "lstrip": False,
218
+ "normalized": False,
219
+ "rstrip": False,
220
+ "single_word": False,
221
+ "special": True,
222
+ }
223
+ tc["added_tokens_decoder"] = added_tokens_decoder
224
+ with open(tc_path, "w") as f:
225
+ json.dump(tc, f, indent=2)
226
+ print(f" Updated tokenizer_config.json with {BYTE_FALLBACK_COUNT} byte tokens")
227
+
228
+ print()
229
+ print(f"=== Done! Fixed checkpoint at: {output_dir} ===")
230
+ print(f"Next: python outputs/llama.cpp/convert_hf_to_gguf.py {output_dir} --outfile outputs/gguf/frankenstallm-3b-f16.gguf --outtype f16")
231
+ return 0
232
+
233
+
234
+ if __name__ == "__main__":
235
+ raise SystemExit(main())
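A sketch of where this fixer sits in the GGUF pipeline, assuming the paths from the docstring and the script's closing "Next:" hint; the quantization step and the quantizer binary name are assumptions that depend on how llama.cpp was built.

```bash
#!/usr/bin/env bash
# Sketch: apply the byte-fallback fix, then convert and quantize the result.
# Paths follow the docstring above; verify them against the local checkout.
set -euo pipefail

python scripts/fix_tokenizer_byte_fallback.py \
  --input  outputs/hf_checkpoint-best \
  --output outputs/hf_checkpoint-best-fixed \
  --sp_model tokenizer/korean_sp/tokenizer.model

# f16 GGUF from the fixed checkpoint (per the script's final hint).
python outputs/llama.cpp/convert_hf_to_gguf.py outputs/hf_checkpoint-best-fixed \
  --outfile outputs/gguf/frankenstallm-3b-f16.gguf --outtype f16

# Quantize to Q4_K_M — the binary may be named llama-quantize or quantize
# depending on the llama.cpp build.
outputs/llama.cpp/build/bin/llama-quantize \
  outputs/gguf/frankenstallm-3b-f16.gguf \
  outputs/gguf/frankenstallm-3b-Q4_K_M.gguf Q4_K_M
```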
source/scripts/hourly_status.sh ADDED
@@ -0,0 +1,241 @@
1
+ #!/usr/bin/env bash
2
+ # =============================================================================
3
+ # hourly_status.sh — FRANKENSTALLM 3B Hourly Training Status Report (Telegram)
4
+ # Run: every hour via cron
5
+ # Sends a rich formatted message with progress, loss, ETA, GPU/disk summary.
6
+ # =============================================================================
7
+ set -euo pipefail
8
+
9
+ # ─── Paths ───────────────────────────────────────────────────────────────────
10
+ WORKDIR="${WORKDIR:-$(cd "$(dirname "$0")/.." && pwd)}"
11
+ CKPT_DIR="$WORKDIR/checkpoints/korean_3b_fp8_run1"
12
+ LOG_FILE="$CKPT_DIR/train.log"
13
+ PID_FILE="$CKPT_DIR/train.pid"
14
+ HOURLY_LOG="$CKPT_DIR/hourly_status.log"
15
+ NOTIFY="python3 $WORKDIR/scripts/telegram_notify.py"
16
+
17
+ TOTAL_STEPS=57000
18
+ TOTAL_TOKENS_B=114 # billion tokens target (57K steps × batch)
19
+
20
+ # ─── Helpers ─────────────────────────────────────────────────────────────────
21
+ ts() { date '+%Y-%m-%d %H:%M:%S'; }
22
+ log() { echo "[$(ts)] $*"; }
23
+
24
+ # Safely get last matching value from log
25
+ parse_last() {
26
+ local pattern="$1"
27
+ grep -oP "$pattern" "$LOG_FILE" 2>/dev/null | tail -1 || echo ""
28
+ }
29
+
30
+ # ─── Parse training log ───────────────────────────────────────────────────────
31
+ parse_log() {
32
+ if [[ ! -f "$LOG_FILE" ]]; then
33
+ echo "NO_LOG"
34
+ return 1
35
+ fi
36
+
37
+ # Get the last step line
38
+ LAST_LINE=$(grep -E 'step\s+[0-9]+.*loss' "$LOG_FILE" 2>/dev/null | tail -1 || echo "")
39
+ if [[ -z "$LAST_LINE" ]]; then
40
+ echo "NO_STEPS"
41
+ return 1
42
+ fi
43
+
44
+ CURRENT_STEP=$(echo "$LAST_LINE" | grep -oP 'step\s+\K[0-9]+' || echo "0")
45
+ CURRENT_LOSS=$(echo "$LAST_LINE" | grep -oP 'loss\s+\K[0-9.]+' || echo "N/A")
46
+ CURRENT_LR=$(echo "$LAST_LINE" | grep -oP 'lr\s+\K[0-9.e+-]+' || echo "N/A")
47
+ CURRENT_GNORM=$(echo "$LAST_LINE" | grep -oP 'gnorm\s+\K[0-9.]+' || echo "N/A")
48
+ CURRENT_TOKPS=$(echo "$LAST_LINE" | grep -oP 'tok/s\s+\K[\d,]+' | tr -d ',' || echo "0")
49
+ CURRENT_MEM=$(echo "$LAST_LINE" | grep -oP 'mem\s+\K[0-9.]+GB' || echo "N/A")
50
+ CURRENT_EPOCH=$(echo "$LAST_LINE" | grep -oP 'epoch\s+\K[0-9]+' || echo "0")
51
+
52
+ # Log timestamp — parse from the line itself
53
+ LOG_TS=$(echo "$LAST_LINE" | grep -oP '\[\K[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}' || echo "unknown")
54
+
55
+ return 0
56
+ }
57
+
58
+ # ─── Calculate progress & ETA ─────────────────────────────────────────────────
59
+ compute_eta() {
60
+ local step="$1"
61
+ local tokps="$2"
62
+
63
+ # Progress
64
+ PROGRESS_PCT=$(echo "scale=1; $step * 100 / $TOTAL_STEPS" | bc -l 2>/dev/null || echo "0")
65
+
66
+ # Steps remaining
67
+ STEPS_LEFT=$(( TOTAL_STEPS - step ))
68
+
69
+ # Tokens processed so far (approx: step × ~1M tokens/step for the 3B run)
70
+ # bs=4, accum=8, 8gpu → effective batch = 4*8*8=256 sequences × 4096 tokens = 1,048,576 ≈ 1M tok/step
71
+ TOKENS_PROCESSED_B=$(echo "scale=2; $step * 1048576 / 1000000000" | bc -l 2>/dev/null || echo "0")
72
+
73
+ # ETA using current tok/s
74
+ if [[ "$tokps" -gt 0 ]]; then
75
+ # tokens remaining
76
+ local tokens_left_b
77
+ tokens_left_b=$(echo "scale=2; ($TOTAL_STEPS - $step) * 1048576 / 1000000000" | bc -l 2>/dev/null || echo "0")
78
+ local tokens_left
79
+ tokens_left=$(echo "scale=0; ($TOTAL_STEPS - $step) * 1048576" | bc -l 2>/dev/null || echo "0")
80
+ local secs_left
81
+ secs_left=$(echo "scale=0; $tokens_left / $tokps" | bc -l 2>/dev/null || echo "0")
82
+
83
+ ETA_HOURS=$(echo "scale=1; $secs_left / 3600" | bc -l 2>/dev/null || echo "N/A")
84
+ if [[ "$ETA_HOURS" != "N/A" ]]; then
85
+ local eta_epoch
86
+ eta_epoch=$(( $(date +%s) + secs_left ))
87
+ ETA_DATETIME=$(date -d "@$eta_epoch" '+%m/%d %H:%M' 2>/dev/null || echo "N/A")
88
+ else
89
+ ETA_DATETIME="N/A"
90
+ fi
91
+ else
92
+ ETA_HOURS="N/A"
93
+ ETA_DATETIME="N/A"
94
+ fi
95
+ }
96
+
97
+ # ─── GPU summary ─────────────────────────────────────────────────────────────
98
+ get_gpu_summary() {
99
+ if ! command -v nvidia-smi &>/dev/null; then
100
+ GPU_SUMMARY="nvidia-smi not available"
101
+ GPU_AVG_UTIL="N/A"
102
+ GPU_TOTAL_MEM="N/A"
103
+ return
104
+ fi
105
+
106
+ local raw
107
+ raw=$(nvidia-smi --query-gpu=index,utilization.gpu,memory.used,memory.total \
108
+ --format=csv,noheader,nounits 2>/dev/null || echo "")
109
+
110
+ if [[ -z "$raw" ]]; then
111
+ GPU_SUMMARY="GPU query failed"
112
+ GPU_AVG_UTIL="N/A"
113
+ GPU_TOTAL_MEM="N/A"
114
+ return
115
+ fi
116
+
117
+ # avg util
118
+ GPU_AVG_UTIL=$(echo "$raw" | awk -F', ' '{sum+=$2; count++} END {printf "%.0f%%", sum/count}')
119
+
120
+ # total mem used / total
121
+ GPU_TOTAL_MEM=$(echo "$raw" | awk -F', ' \
122
+ '{used+=$3; total+=$4} END {printf "%.1f / %.1f GiB", used/1024, total/1024}')
123
+
124
+ # Per-GPU one-liner: "G0:95% 48G | G1:94% 48G | ..."
125
+ GPU_SUMMARY=$(echo "$raw" | awk -F', ' \
126
+ '{printf "G%s:%s%% %sMiB | ", $1, $2, $3}' | sed 's/ | $//')
127
+ }
128
+
129
+ # ─── Disk usage ──────────────────────────────────────────────────────────────
130
+ get_disk_info() {
131
+ DISK_INFO=$(df -h "$CKPT_DIR" 2>/dev/null | awk 'NR==2 {printf "%s used / %s total (%s)", $3, $2, $5}' || echo "N/A")
132
+ CKPT_COUNT=$(ls -d "$CKPT_DIR"/checkpoint-* 2>/dev/null | wc -l || echo "0")
133
+ LAST_CKPT=$(ls -dt "$CKPT_DIR"/checkpoint-* 2>/dev/null | head -1 | xargs basename 2>/dev/null || echo "none")
134
+ }
135
+
136
+ # ─── Process status ───────────────────────────────────────────────────────────
137
+ get_process_status() {
138
+ PROC_STATUS="UNKNOWN"
139
+ if [[ -f "$PID_FILE" ]]; then
140
+ local pid
141
+ pid=$(cat "$PID_FILE" 2>/dev/null | tr -d '[:space:]')
142
+ if [[ -n "$pid" ]] && kill -0 "$pid" 2>/dev/null; then
143
+ PROC_STATUS="RUNNING (PID $pid)"
144
+ else
145
+ PROC_STATUS="STOPPED (PID $pid)"
146
+ fi
147
+ else
148
+ PROC_STATUS="NO PID FILE"
149
+ fi
150
+ }
151
+
152
+ # ─── Build & send message ────────────────────────────────────────────────────
153
+ build_and_send() {
154
+ local step="$CURRENT_STEP"
155
+ local loss="$CURRENT_LOSS"
156
+ local tokps="$CURRENT_TOKPS"
157
+
158
+ # Status icon
159
+ local status_icon
160
+ if [[ "$PROC_STATUS" == RUNNING* ]]; then
161
+ status_icon="&#9989;" # green check
162
+ else
163
+ status_icon="&#10060;" # red X
164
+ fi
165
+
166
+ # Progress bar (20 chars)
167
+ local bar_filled=$(echo "scale=0; $PROGRESS_PCT * 20 / 100" | bc -l 2>/dev/null || echo "0")
168
+ local bar_empty=$(( 20 - bar_filled ))
169
+ PROGRESS_BAR=$(printf '%0.s&#9608;' $(seq 1 $bar_filled 2>/dev/null) ; printf '%0.s&#9617;' $(seq 1 $bar_empty 2>/dev/null)) || PROGRESS_BAR="[$PROGRESS_PCT%]"
170
+
171
+ local msg
172
+ msg="$(cat <<EOF
173
+ <b>FRANKENSTALLM 3B — Hourly Status</b>
174
+ <i>$(ts)</i>
175
+
176
+ $status_icon <b>Process:</b> $PROC_STATUS
177
+
178
+ <b>Progress</b>
179
+ Step: <code>$step / $TOTAL_STEPS</code> ($PROGRESS_PCT%)
180
+ Tokens: <code>${TOKENS_PROCESSED_B}B / ${TOTAL_TOKENS_B}B</code>
181
+ Epoch: <code>$CURRENT_EPOCH</code>
182
+ Last log: <code>$LOG_TS</code>
183
+
184
+ <b>Training Metrics</b>
185
+ Loss: <code>$loss</code>
186
+ LR: <code>$CURRENT_LR</code>
187
+ Gnorm: <code>$CURRENT_GNORM</code>
188
+ Tok/s: <code>$tokps</code>
189
+ Mem: <code>$CURRENT_MEM</code>
190
+
191
+ <b>ETA</b>
192
+ Steps left: <code>$STEPS_LEFT</code>
193
+ Remaining: <code>~$ETA_HOURS h</code>
194
+ Est. done: <code>$ETA_DATETIME</code>
195
+
196
+ <b>GPU</b>
197
+ Avg util: <code>$GPU_AVG_UTIL</code>
198
+ Total mem: <code>$GPU_TOTAL_MEM</code>
199
+
200
+ <b>Checkpoints</b>
201
+ Last saved: <code>$LAST_CKPT</code>
202
+ Total: <code>$CKPT_COUNT</code> checkpoints
203
+
204
+ <b>Disk</b>
205
+ <code>$DISK_INFO</code>
206
+ EOF
207
+ )"
208
+
209
+ log "Sending hourly status report (step $step)..."
210
+ $NOTIFY "$msg" || {
211
+ log "ERROR: Failed to send Telegram message."
212
+ return 1
213
+ }
214
+ log "Status report sent."
215
+ }
216
+
217
+ # ─── Main ────────────────────────────────────────────────────────────────────
218
+ main() {
219
+ log "=== Hourly status START ==="
220
+
221
+ parse_log || {
222
+ log "Cannot parse log — sending minimal status."
223
+ $NOTIFY "<b>FRANKENSTALLM 3B</b> — Status check at $(ts)
224
+
225
+ <b>WARNING:</b> Cannot read training log at:
226
+ <code>$LOG_FILE</code>
227
+
228
+ Process status: $(cat "$PID_FILE" 2>/dev/null && echo "(PID found)" || echo "(no PID file)")" || true
229
+ return 0
230
+ }
231
+
232
+ compute_eta "$CURRENT_STEP" "$CURRENT_TOKPS"
233
+ get_gpu_summary
234
+ get_disk_info
235
+ get_process_status
236
+ build_and_send
237
+
238
+ log "=== Hourly status END ==="
239
+ }
240
+
241
+ main "$@"
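The header notes that this report is meant to run hourly from cron. One way to install it is sketched below; the project path is taken from launch_hybrid_3b.sh and the cron-side log file name is an assumption.

```bash
# Hypothetical crontab entry — edit with `crontab -e`. Runs the report at the
# top of every hour and appends cron-side output to a separate log file.
0 * * * * cd /PROJECT/0325120031_A/ghong/taketimes/llm-bang && bash scripts/hourly_status.sh >> checkpoints/korean_3b_fp8_run1/hourly_cron.log 2>&1
```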
source/scripts/launch_3b_orpo.sh ADDED
@@ -0,0 +1,177 @@
1
+ #!/usr/bin/env bash
2
+ # =============================================================================
3
+ # launch_3b_orpo.sh — 8-GPU ORPO fine-tuning launcher for Korean 3B LLM
4
+ #
5
+ # Usage:
6
+ # bash scripts/launch_3b_orpo.sh # 기본 실행
7
+ # bash scripts/launch_3b_orpo.sh --max_steps 200 # 빠른 테스트
8
+ # RUN_NAME=my_orpo bash scripts/launch_3b_orpo.sh # 이름 지정
9
+ #
10
+ # 기반 모델 : eval/outputs/hf_3b_sft_best (SFT v1 best)
11
+ # 데이터 : data/preference/combined_preference.jsonl
12
+ # 출력 : checkpoints/korean_3b_orpo_v1/
13
+ # 로그 : checkpoints/korean_3b_orpo_v1/train.log
14
+ #
15
+ # 체크포인트 크기 예상:
16
+ # model weights: ~6GB (bf16)
17
+ # optimizer states: ~24GB
18
+ # 총 ~30GB/개 × max 5개 = 150GB
19
+ # =============================================================================
20
+ set -euo pipefail
21
+
22
+ # ---- Configurable defaults --------------------------------------------------
23
+ RUN_NAME="${RUN_NAME:-korean_3b_orpo_v1}"
24
+ BASE_MODEL="${BASE_MODEL:-eval/outputs/hf_3b_sft_best}"
25
+ DATA_PATH="${DATA_PATH:-data/preference/combined_preference.jsonl}"
26
+ OUTPUT_DIR="checkpoints/${RUN_NAME}"
27
+ CKPT_DIR="checkpoints/${RUN_NAME}"
28
+ LOG_FILE="${CKPT_DIR}/train.log"
29
+ NPROC=8
30
+ MASTER_PORT="${MASTER_PORT:-29502}"
31
+
32
+ # ORPO 하이퍼파라미터
33
+ BATCH_SIZE=4
34
+ GRAD_ACCUM=4
35
+ LR=1.2e-5
36
+ BETA=0.25
37
+ EPOCHS=2
38
+ MAX_LENGTH=1536
39
+ WARMUP_RATIO=0.05
40
+ WEIGHT_DECAY=0.01
41
+ EVAL_SPLIT_RATIO=0.05
42
+ EVAL_STEPS=500
43
+ EARLY_STOPPING_PATIENCE=3
44
+ SAVE_TOTAL_LIMIT=5
45
+ SEED=42
46
+
47
+ EXTRA_ARGS="$@"
48
+
49
+ # ---- B200 / NVSwitch single-node NCCL tuning --------------------------------
50
+ # (launch_3b_pretrain.sh와 동일한 NCCL 설정 유지)
51
+ export NCCL_IB_DISABLE=1
52
+ export NCCL_PROTO=Simple
53
+ export NCCL_MIN_NCHANNELS=16
54
+ export NCCL_MAX_NCHANNELS=16
55
+ # ORPO forward-backward 패스는 pretrain보다 메모리 변동이 크므로 버퍼 128MB 유지
56
+ export NCCL_BUFFSIZE=134217728
57
+ export OMP_NUM_THREADS=9
58
+ export MKL_NUM_THREADS=9
59
+ # OOM 방지: 메모리 단편화 완화 (ORPO는 chosen/rejected 동시 forward → 메모리 민감)
60
+ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
61
+ # P2P NVLink 직접 통신 활성화
62
+ export NCCL_P2P_LEVEL=NVL
63
+ # Ring + Tree 병행 (3B gradient 크기 기준)
64
+ export NCCL_ALGO=Ring,Tree
65
+
66
+ export PYTHONWARNINGS="ignore::UserWarning:torch.library"
67
+
68
+ cd "$(dirname "$0")/.."
69
+
70
+ # ---- Pre-flight checks ------------------------------------------------------
71
+ if [[ ! -d "${BASE_MODEL}" ]]; then
72
+ echo "ERROR: 기반 모델 디렉토리 없음: ${BASE_MODEL}"
73
+ echo " SFT 완료 후 HF 포맷으로 변환했는지 확인하세요."
74
+ echo " 예: python scripts/convert_to_hf.py --checkpoint <sft_ckpt> --output ${BASE_MODEL}"
75
+ exit 1
76
+ fi
77
+
78
+ if [[ ! -f "${DATA_PATH}" ]]; then
79
+ echo "ERROR: 학습 데이터 없음: ${DATA_PATH}"
80
+ echo " 먼저 데이터 통합 스크립트를 실행하세요:"
81
+ echo " python data/prepare_preference_combined.py"
82
+ exit 1
83
+ fi
84
+
85
+ if [[ ! -f "train/orpo.py" ]]; then
86
+ echo "ERROR: train/orpo.py 없음"
87
+ exit 1
88
+ fi
89
+
90
+ # GPU 메모리 체크
91
+ GPU_MEM=$(nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits 2>/dev/null | head -1 || echo "0")
92
+ if [[ "$GPU_MEM" -gt 0 && "$GPU_MEM" -lt 40000 ]]; then
93
+ echo "WARNING: GPU 메모리 ${GPU_MEM}MB < 40GB. ORPO 3B 학습에 부족할 수 있음."
94
+ fi
95
+
96
+ # 중복 프로세스 방지
97
+ EXISTING_PID=$(pgrep -f "orpo.py.*${RUN_NAME}" 2>/dev/null | head -1 || true)
98
+ if [[ -n "$EXISTING_PID" ]]; then
99
+ echo "ERROR: 이미 ORPO 프로세스 실행 중 (PID: ${EXISTING_PID})"
100
+ echo " kill ${EXISTING_PID} 로 먼저 종료하세요."
101
+ exit 1
102
+ fi
103
+
104
+ # 디스크 여유 확인 (최소 200GB)
105
+ AVAIL_KB=$(df /PROJECT 2>/dev/null | awk 'NR==2{print $4}' || echo "0")
106
+ if [[ -n "$AVAIL_KB" && "$AVAIL_KB" -gt 0 && "$AVAIL_KB" -lt 209715200 ]]; then
107
+ AVAIL_GB=$(echo "scale=1; $AVAIL_KB / 1048576" | bc 2>/dev/null || echo "?")
108
+ echo "WARNING: /PROJECT 여유 ${AVAIL_GB}GB < 200GB. 체크포인트 저장 공간 부족 가능."
109
+ fi
110
+
111
+ mkdir -p "${CKPT_DIR}" "${OUTPUT_DIR}"
112
+
113
+ # ---- 데이터 레코드 수 확인 --------------------------------------------------
114
+ DATA_LINES=$(wc -l < "${DATA_PATH}" 2>/dev/null || echo "?")
115
+ echo " 학습 데이터 레코드 수: ${DATA_LINES}"
116
+
117
+ # ---- 유효 배치 크기 계산 ----------------------------------------------------
118
+ EFF_BATCH=$((BATCH_SIZE * NPROC * GRAD_ACCUM))
119
+
120
+ echo "=================================================================="
121
+ echo " Korean 3B LLM ORPO Fine-Tuning"
122
+ echo " Run name : ${RUN_NAME}"
123
+ echo " Base model : ${BASE_MODEL}"
124
+ echo " Data : ${DATA_PATH} (${DATA_LINES} records)"
125
+ echo " Output dir : ${OUTPUT_DIR}"
126
+ echo " CKPT dir : ${CKPT_DIR}"
127
+ echo " Log file : ${LOG_FILE}"
128
+ echo " Epochs : ${EPOCHS}"
129
+ echo " LR : ${LR}"
130
+ echo " Beta (ORPO) : ${BETA}"
131
+ echo " Batch : ${BATCH_SIZE} (local) × ${NPROC} GPU × ${GRAD_ACCUM} accum = ${EFF_BATCH}"
132
+ echo " Max length : ${MAX_LENGTH}"
133
+ echo " Weight decay : ${WEIGHT_DECAY}"
134
+ echo " Eval steps : ${EVAL_STEPS}"
135
+ echo " Early stop : patience=${EARLY_STOPPING_PATIENCE}"
136
+ echo " Started : $(date)"
137
+ echo "=================================================================="
138
+
139
+ torchrun \
140
+ --nproc_per_node=${NPROC} \
141
+ --master_port=${MASTER_PORT} \
142
+ train/orpo.py \
143
+ --model_path "${BASE_MODEL}" \
144
+ --custom_data_path "${DATA_PATH}" \
145
+ --output_dir "${OUTPUT_DIR}" \
146
+ --epochs ${EPOCHS} \
147
+ --lr ${LR} \
148
+ --beta ${BETA} \
149
+ --batch_size ${BATCH_SIZE} \
150
+ --gradient_accumulation_steps ${GRAD_ACCUM} \
151
+ --max_length ${MAX_LENGTH} \
152
+ --weight_decay ${WEIGHT_DECAY} \
153
+ --eval_split_ratio ${EVAL_SPLIT_RATIO} \
154
+ --eval_steps ${EVAL_STEPS} \
155
+ --early_stopping_patience ${EARLY_STOPPING_PATIENCE} \
156
+ --save_total_limit ${SAVE_TOTAL_LIMIT} \
157
+ ${EXTRA_ARGS} \
158
+ 2>&1 | tee "${LOG_FILE}" \
159
+ | grep -v "UserWarning" \
160
+ | grep -v "Warning only once" \
161
+ | grep -v "Overriding a previously" \
162
+ | grep -v "dispatch key:" \
163
+ | grep -v "previous kernel:" \
164
+ | grep -v "new kernel:" \
165
+ | grep -v "operator: flash_attn" \
166
+ | grep -v "registered at /usr/local" \
167
+ | grep -v "self.m.impl"
168
+
169
+ EXIT_CODE=$?
170
+ echo "=================================================================="
171
+ echo " Done : $(date)"
172
+ echo " Exit code: ${EXIT_CODE}"
173
+ if [[ "${EXIT_CODE}" -eq 0 ]]; then
174
+ echo " 모델 저장 위치: ${OUTPUT_DIR}"
175
+ fi
176
+ echo "=================================================================="
177
+ exit $EXIT_CODE
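With the defaults above, one optimizer step consumes 4 × 8 × 4 = 128 preference pairs, so the total step count follows directly from the record count printed as DATA_LINES. A small sketch of that arithmetic (the default record count is a placeholder):

```bash
#!/usr/bin/env bash
# Sketch: estimate ORPO optimizer steps from the defaults in launch_3b_orpo.sh.
# Pass the real `wc -l` of combined_preference.jsonl as $1; 300000 is a placeholder.
set -euo pipefail

BATCH_SIZE=4; NPROC=8; GRAD_ACCUM=4; EPOCHS=2
DATA_LINES="${1:-300000}"

EFF_BATCH=$((BATCH_SIZE * NPROC * GRAD_ACCUM))                 # 128 pairs/step
STEPS_PER_EPOCH=$(( (DATA_LINES + EFF_BATCH - 1) / EFF_BATCH ))
TOTAL_STEPS=$((STEPS_PER_EPOCH * EPOCHS))

echo "effective batch : ${EFF_BATCH}"
echo "steps per epoch : ${STEPS_PER_EPOCH}"
echo "total steps     : ${TOTAL_STEPS} (before early stopping)"
```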
source/scripts/launch_3b_pretrain.sh ADDED
@@ -0,0 +1,258 @@
1
+ #!/usr/bin/env bash
2
+ # =============================================================================
3
+ # launch_3b_pretrain.sh — 8-GPU FP8 pretraining launcher for Korean 3B LLM
4
+ #
5
+ # Features:
6
+ # - SIGHUP 방어: SSH 끊김 시 자동으로 nohup+setsid로 세션 보호
7
+ # - Graceful shutdown: SIGTERM 시 Python 시그널 핸들러가 비상 체크포인트 저장
8
+ # - 자동 resume: 최신 체크포인트에서 자동 재개
9
+ # - PID 파일: 프로세스 모니터링 및 제어용
10
+ # - grep 파이프라인 exit code 보호 (|| true)
11
+ #
12
+ # Usage:
13
+ # bash scripts/launch_3b_pretrain.sh # full run (60B tokens)
14
+ # bash scripts/launch_3b_pretrain.sh --max_steps 500 # quick test
15
+ # bash scripts/launch_3b_pretrain.sh --resume checkpoints/korean_3b_fp8_run1/checkpoint-0010000
16
+ # MAX_STEPS=95000 bash scripts/launch_3b_pretrain.sh # 100B tokens
17
+ #
18
+ # 모니터링:
19
+ # tail -f checkpoints/korean_3b_fp8_run1/train.log
20
+ # cat checkpoints/korean_3b_fp8_run1/train.pid
21
+ #
22
+ # 중지 (비상 체크포인트 자동 저장):
23
+ # kill $(cat checkpoints/korean_3b_fp8_run1/train.pid)
24
+ #
25
+ # 강제 종료 (체크포인트 저장 없음):
26
+ # kill -9 $(cat checkpoints/korean_3b_fp8_run1/train.pid)
27
+ # =============================================================================
28
+
29
+ # -u: error on undefined variables
30
+ # NOTE: -e and -o pipefail are intentionally omitted.
31
+ # Previous problem: when the grep pipeline filtered out every line it returned exit code 1
32
+ # → pipefail propagated that as a script failure → training aborted
33
+ # Fix: drop set -e/pipefail and add || true to the grep chain
34
+ set -u
35
+
36
+ # ---- Configurable defaults --------------------------------------------------
37
+ RUN_NAME="${RUN_NAME:-korean_3b_fp8_run1}"
38
+ CONFIG="${CONFIG:-configs/korean_3b_fp8.yaml}"
39
+ TRAIN_DATA="${TRAIN_DATA:-data/3b_train.bin}"
40
+ VAL_DATA="${VAL_DATA:-data/3b_val.bin}"
41
+ CKPT_DIR="checkpoints/${RUN_NAME}"
42
+ LOG_FILE="${CKPT_DIR}/train.log"
43
+ NPROC=8
44
+ MASTER_PORT="${MASTER_PORT:-29501}"
45
+
46
+ MAX_STEPS="${MAX_STEPS:-57000}"
47
+ BATCH_SIZE=5
48
+ GRAD_ACCUM=8
49
+ WARMUP_STEPS=2000
50
+ SEED=42
51
+
52
+ # ---- B200 / NVSwitch single-node NCCL tuning (3B optimized, v2) ----------
53
+ export NCCL_IB_DISABLE=1
54
+ export NCCL_ALGO=NVLS,Ring # NVSwitch hardware reduction first (was Ring,Tree)
55
+ export NCCL_PROTO=Simple
56
+ export NCCL_NVLS_ENABLE=1 # NVLink SHARP — hardware-accelerated all-reduce
57
+ export NCCL_MIN_NCHANNELS=32 # raise minimum for NVSwitch headroom (was 16)
58
+ export NCCL_MAX_NCHANNELS=32
59
+ export NCCL_BUFFSIZE=268435456 # 256MB (was 128MB) — reduces bucket pipeline stalls
60
+ export NCCL_P2P_LEVEL=NVL
61
+ export NCCL_NET_GDR_LEVEL=0
62
+ export OMP_NUM_THREADS=4
63
+ export MKL_NUM_THREADS=4
64
+ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
65
+ # Triton/Inductor cache on executable filesystem (not /tmp which is noexec)
66
+ export TRITON_CUDACRT_PATH=/usr/local/cuda/include
67
+ export TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas
68
+
69
+ cd "$(dirname "$0")/.."
70
+
71
+ mkdir -p "${CKPT_DIR}"
72
+
73
+ # ---- Session protection (SIGHUP defence) -------------------------------------
74
+ # When run outside tmux/screen, the script automatically re-wraps itself in
75
+ # nohup + setsid so the training process survives SSH disconnects (SIGHUP).
76
+ #
77
+ # How it works:
78
+ # 1. Check for tmux/screen or an already-protected relaunch
79
+ # 2. If unprotected, set _LAUNCH_PROTECTED=1 and re-exec this script under nohup setsid
80
+ # 3. The re-executed process becomes a new session leader, detached from the terminal
81
+ # 4. The original shell prints the PID and monitoring commands, then exits immediately
82
+ PID_FILE="${CKPT_DIR}/train.pid"
83
+
84
+ if [[ -z "${_LAUNCH_PROTECTED:-}" ]] && [[ -z "${TMUX:-}" ]] && [[ -z "${STY:-}" ]]; then
85
+ export _LAUNCH_PROTECTED=1
86
+ NOHUP_LOG="${CKPT_DIR}/launch_$(date +%Y%m%d_%H%M%S).log"
87
+
88
+ echo "=================================================================="
89
+ echo " SIGHUP PROTECTION ACTIVATED"
90
+ echo " tmux/screen 미감지 → 세션 보호 모드 자동 활성화 (nohup + setsid)"
91
+ echo " SSH 끊어져도 학습이 계속됩니다."
92
+ echo "=================================================================="
93
+ echo ""
94
+
95
+ # 자기 자신을 세션 보호 모드로 재실행
96
+ nohup setsid bash "$0" "$@" > "${NOHUP_LOG}" 2>&1 &
97
+ BG_PID=$!
98
+ echo "${BG_PID}" > "${PID_FILE}"
99
+
100
+ echo " PID : ${BG_PID}"
101
+ echo " PID 파일 : ${PID_FILE}"
102
+ echo " Launch 로그 : ${NOHUP_LOG}"
103
+ echo " 학습 로그 : ${LOG_FILE}"
104
+ echo ""
105
+ echo " 모니터링:"
106
+ echo " tail -f ${LOG_FILE}"
107
+ echo ""
108
+ echo " 중지 (비상 체크포인트 자동 저장):"
109
+ echo " kill \$(cat ${PID_FILE})"
110
+ echo ""
111
+ echo " 강제 종료:"
112
+ echo " kill -9 \$(cat ${PID_FILE})"
113
+ echo "=================================================================="
114
+ exit 0
115
+ fi
116
+
117
+ # ---- Cleanup on exit --------------------------------------------------------
118
+ PREWARM_PID=""
119
+
120
+ cleanup() {
121
+ rm -f "${PID_FILE}" 2>/dev/null || true
122
+ if [[ -n "${PREWARM_PID:-}" ]]; then
123
+ kill "${PREWARM_PID}" 2>/dev/null || true
124
+ fi
125
+ }
126
+ trap cleanup EXIT
127
+
128
+ # PID 파일 기록 (tmux/screen 내에서 실행 시에도 PID 추적 가능)
129
+ echo "$$" > "${PID_FILE}"
130
+
131
+ # ---- Pre-flight checks ------------------------------------------------------
132
+ if [[ ! -f "${CONFIG}" ]]; then
133
+ echo "[ERROR] Config not found: ${CONFIG}"
134
+ exit 1
135
+ fi
136
+
137
+ if [[ ! -f "${TRAIN_DATA}" ]]; then
138
+ echo "[ERROR] Training data not found: ${TRAIN_DATA}"
139
+ exit 1
140
+ fi
141
+
142
+ # GPU 메모리 체크 (3B는 최소 80GB/GPU 권장, B200=192GB → OK)
143
+ GPU_MEM=$(nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits 2>/dev/null | head -1 || echo "0")
144
+ if [[ "$GPU_MEM" -gt 0 && "$GPU_MEM" -lt 80000 ]]; then
145
+ echo "[WARN] GPU memory ${GPU_MEM}MB < 80GB. 3B 학습에 부족할 수 있음."
146
+ fi
147
+
148
+ # 중복 프로세스 방지
149
+ EXISTING_PID=$(pgrep -f "pretrain.py.*korean_3b" 2>/dev/null | head -1 || true)
150
+ if [[ -n "$EXISTING_PID" ]]; then
151
+ echo "[ERROR] 이미 3B pretrain 프로세스 실행 중 (PID: ${EXISTING_PID})"
152
+ echo " kill ${EXISTING_PID} 로 먼저 종료하세요."
153
+ exit 1
154
+ fi
155
+
156
+ # 디스크 여유 확인 (최소 1TB 필요)
157
+ AVAIL_KB=$(df /PROJECT 2>/dev/null | awk 'NR==2{print $4}')
158
+ if [[ -n "${AVAIL_KB:-}" ]] && [[ "$AVAIL_KB" -lt 1073741824 ]]; then
159
+ AVAIL_TB=$(echo "scale=1; $AVAIL_KB / 1073741824" | bc 2>/dev/null || echo "?")
160
+ echo "[WARN] /PROJECT 여유 ${AVAIL_TB}TB < 1TB. 체크포인트 저장 공간 부족 가능."
161
+ fi
162
+
163
+ # ---- Resume detection -------------------------------------------------------
164
+ RESUME_ARG=""
165
+ EXTRA_ARGS="${*:-}"
166
+ if [[ ! "${EXTRA_ARGS}" =~ "--resume" ]]; then
167
+ # 가장 최근 체크포인트 자동 감지
168
+ LATEST_CKPT=$(ls -d "${CKPT_DIR}"/checkpoint-* 2>/dev/null | sort -V | tail -1 || true)
169
+ if [[ -n "$LATEST_CKPT" ]]; then
170
+ echo "[INFO] 자동 resume 감지: ${LATEST_CKPT}"
171
+ RESUME_ARG="--resume ${LATEST_CKPT}"
172
+ fi
173
+ fi
174
+
175
+ # ---- Banner ------------------------------------------------------------------
176
+ SESSION_TYPE="direct"
177
+ [[ -n "${TMUX:-}" ]] && SESSION_TYPE="tmux"
178
+ [[ -n "${STY:-}" ]] && SESSION_TYPE="screen"
179
+ [[ -n "${_LAUNCH_PROTECTED:-}" ]] && SESSION_TYPE="protected (nohup+setsid)"
180
+
181
+ echo "=================================================================="
182
+ echo " Korean 3B LLM Pre-Training (FP8)"
183
+ echo " Run name : ${RUN_NAME}"
184
+ echo " Config : ${CONFIG}"
185
+ echo " CKPT dir : ${CKPT_DIR}"
186
+ echo " Log file : ${LOG_FILE}"
187
+ echo " Max steps : ${MAX_STEPS}"
188
+ echo " Batch : ${BATCH_SIZE} (local) x ${NPROC} GPU x ${GRAD_ACCUM} accum"
189
+ echo " Eff tokens : $((BATCH_SIZE * NPROC * GRAD_ACCUM * 4096)) tokens/step (~1M)"
190
+ echo " Total tokens: ~$((MAX_STEPS * BATCH_SIZE * NPROC * GRAD_ACCUM * 4096 / 1000000000))B"
191
+ echo " Resume : ${RESUME_ARG:-none (fresh start)}"
192
+ echo " Session : ${SESSION_TYPE}"
193
+ echo " PID : $$ (file: ${PID_FILE})"
194
+ echo " Started : $(date)"
195
+ echo "=================================================================="
196
+
197
+ export PYTHONWARNINGS="ignore::UserWarning:torch.library"
198
+
199
+ # ---- Pre-warm OS page cache (NUMA-interleaved, non-blocking) ---------------
200
+ if [[ -f "${TRAIN_DATA}" ]]; then
201
+ echo "[INFO] Pre-warming page cache for ${TRAIN_DATA} (NUMA interleaved)..."
202
+ numactl --interleave=all dd if="${TRAIN_DATA}" of=/dev/null bs=16M 2>/dev/null &
203
+ PREWARM_PID=$!
204
+ fi
205
+
206
+ # ---- Launch training ---------------------------------------------------------
207
+ # grep pipeline protection:
208
+ # Problem: grep -v returns exit code 1 when no lines match
209
+ # Fix: wrapping in { ... || true; } guarantees the filter stage always exits 0
210
+ # torchrun's real exit code is captured separately via PIPESTATUS[0]
211
+ numactl --interleave=all \
212
+ torchrun \
213
+ --nproc_per_node=${NPROC} \
214
+ --master_port=${MASTER_PORT} \
215
+ train/pretrain.py \
216
+ --config "${CONFIG}" \
217
+ --train_data "${TRAIN_DATA}" \
218
+ --val_data "${VAL_DATA}" \
219
+ --checkpoint_dir "${CKPT_DIR}" \
220
+ --log_file "${LOG_FILE}" \
221
+ --max_steps ${MAX_STEPS} \
222
+ --batch_size ${BATCH_SIZE} \
223
+ --grad_accum ${GRAD_ACCUM} \
224
+ --warmup_steps ${WARMUP_STEPS} \
225
+ --seed ${SEED} \
226
+ ${RESUME_ARG} \
227
+ ${EXTRA_ARGS} \
228
+ 2>&1 | { grep -v "UserWarning" \
229
+ | grep -v "Warning only once" \
230
+ | grep -v "Overriding a previously" \
231
+ | grep -v "dispatch key:" \
232
+ | grep -v "previous kernel:" \
233
+ | grep -v "new kernel:" \
234
+ | grep -v "operator: flash_attn" \
235
+ | grep -v "registered at /usr/local" \
236
+ | grep -v "self.m.impl" \
237
+ || true; }
238
+
239
+ EXIT_CODE=${PIPESTATUS[0]}
240
+
241
+ # ---- Exit summary ------------------------------------------------------------
242
+ echo ""
243
+ echo "=================================================================="
244
+ echo " Finished : $(date)"
245
+ echo " Exit code : ${EXIT_CODE}"
246
+ if [[ ${EXIT_CODE} -eq 0 ]]; then
247
+ echo " Status : SUCCESS (학습 완료 또는 graceful shutdown)"
248
+ elif [[ ${EXIT_CODE} -eq 143 ]]; then
249
+ echo " Status : TERMINATED (SIGTERM — 비상 체크포인트 저장됨)"
250
+ elif [[ ${EXIT_CODE} -eq 137 ]]; then
251
+ echo " Status : KILLED (SIGKILL — 강제 종료, 체크포인트 미저장)"
252
+ elif [[ ${EXIT_CODE} -eq 1 ]]; then
253
+ echo " Status : ERROR (${LOG_FILE} 확인 필요)"
254
+ else
255
+ echo " Status : FAILED (exit code ${EXIT_CODE}, ${LOG_FILE} 확인)"
256
+ fi
257
+ echo "=================================================================="
258
+ exit ${EXIT_CODE}
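Putting the graceful-shutdown and auto-resume pieces together, a typical stop-and-restart cycle might look like the following; checkpoint names are whatever the trainer last saved, and the relaunch picks the newest checkpoint-* automatically.

```bash
#!/usr/bin/env bash
# Sketch: graceful stop + restart cycle for the 3B pretrain run.
set -euo pipefail

CKPT_DIR=checkpoints/korean_3b_fp8_run1

# 1) Graceful stop — SIGTERM lets the Python handler save an emergency checkpoint.
kill "$(cat "${CKPT_DIR}/train.pid")"

# 2) Wait until the trainer processes are really gone before relaunching.
while pgrep -f "pretrain.py.*korean_3b" >/dev/null; do sleep 10; done

# 3) Relaunch — the launcher auto-detects the newest checkpoint-* and resumes.
bash scripts/launch_3b_pretrain.sh

# 4) Follow progress.
tail -f "${CKPT_DIR}/train.log"
```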
source/scripts/launch_3b_sft.sh ADDED
@@ -0,0 +1,145 @@
1
+ #!/usr/bin/env bash
2
+ # =============================================================================
3
+ # launch_3b_sft.sh — 8-GPU FP8 SFT launcher for 3B Korean LLM
4
+ #
5
+ # Usage:
6
+ # bash scripts/launch_3b_sft.sh
7
+ # bash scripts/launch_3b_sft.sh --max_steps 200 # quick test
8
+ # bash scripts/launch_3b_sft.sh --resume checkpoints/korean_3b_sft_v1/checkpoint-0002000
9
+ #
10
+ # Base model : checkpoints/korean_3b_fp8_run1/checkpoint-XXXXXX (기본값)
11
+ # --base_checkpoint 인자로 덮어쓸 수 있음
12
+ # SFT data : data/sft_combined/train_filtered.jsonl
13
+ # (먼저 scripts/prepare_sft_combined.sh → data/filter_sft_v2.py 실행)
14
+ #
15
+ # Effective batch: 2 (local) × 8 GPU × 4 (grad_accum) = 64 samples/step
16
+ # =============================================================================
17
+ set -euo pipefail
18
+
19
+ # ---- Configurable defaults --------------------------------------------------
20
+ RUN_NAME="${RUN_NAME:-korean_3b_sft_v1}"
21
+ CONFIG="${CONFIG:-configs/korean_3b_sft.yaml}"
22
+ BASE_CHECKPOINT="${BASE_CHECKPOINT:-checkpoints/korean_3b_fp8_run1/checkpoint-0057000}"
23
+ SFT_DATA="${SFT_DATA:-data/sft_combined/train_filtered.jsonl}"
24
+ VAL_DATA="${VAL_DATA:-data/sft_combined/val_filtered.jsonl}"
25
+ CKPT_DIR="checkpoints/${RUN_NAME}"
26
+ LOG_FILE="${CKPT_DIR}/train.log"
27
+ NPROC=8
28
+ MASTER_PORT="${MASTER_PORT:-29503}"
29
+
30
+ MAX_STEPS=33000
31
+ BATCH_SIZE=2
32
+ GRAD_ACCUM=4
33
+ LR="1.0e-5"
34
+ WARMUP_STEPS=500
35
+ SEED=42
36
+
37
+ EXTRA_ARGS="$@"
38
+
39
+ # ---- B200 / NVSwitch NCCL tuning (same as pretrain) -------------------------
40
+ export NCCL_IB_DISABLE=1
41
+ export NCCL_ALGO=Ring
42
+ export NCCL_PROTO=Simple
43
+ export NCCL_MIN_NCHANNELS=16
44
+ export NCCL_MAX_NCHANNELS=16
45
+ export NCCL_BUFFSIZE=67108864
46
+ export OMP_NUM_THREADS=4
47
+ export MKL_NUM_THREADS=4
48
+
49
+ # 3B 모델 VRAM 절약 — 동적 메모리 세그먼트 확장 허용
50
+ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
51
+
52
+ cd "$(dirname "$0")/.."
53
+
54
+ # ---- Pre-flight checks ------------------------------------------------------
55
+ if [[ ! -d "${BASE_CHECKPOINT}" ]]; then
56
+ echo "=================================================================="
57
+ echo " ERROR: Base checkpoint 디렉토리를 찾을 수 없습니다."
58
+ echo " 경로: ${BASE_CHECKPOINT}"
59
+ echo ""
60
+ echo " --base_checkpoint 인자로 실제 경로를 지정하거나"
61
+ echo " BASE_CHECKPOINT 환경변수를 설정하세요."
62
+ echo " 예: bash scripts/launch_3b_sft.sh --base_checkpoint checkpoints/korean_3b_fp8_run1/checkpoint-0057000"
63
+ echo "=================================================================="
64
+ exit 1
65
+ fi
66
+
67
+ if [[ ! -f "${SFT_DATA}" ]]; then
68
+ echo "=================================================================="
69
+ echo " ERROR: SFT 학습 데이터를 찾을 수 없습니다: ${SFT_DATA}"
70
+ echo ""
71
+ echo " 데이터 준비 순서:"
72
+ echo " 1. bash scripts/prepare_sft_combined.sh"
73
+ echo " 2. python data/filter_sft_v2.py \\"
74
+ echo " --input data/sft_combined/train.jsonl \\"
75
+ echo " --output data/sft_combined/train_filtered.jsonl"
76
+ echo "=================================================================="
77
+ exit 1
78
+ fi
79
+
80
+ # val 파일 없으면 원본 val.jsonl 로 폴백
81
+ if [[ ! -f "${VAL_DATA}" ]]; then
82
+ VAL_FALLBACK="data/sft_combined/val.jsonl"
83
+ if [[ -f "${VAL_FALLBACK}" ]]; then
84
+ VAL_DATA="${VAL_FALLBACK}"
85
+ echo "[INFO] val_filtered 없음, 폴백: ${VAL_DATA}"
86
+ else
87
+ echo "ERROR: VAL_DATA 파일을 찾을 수 없습니다: ${VAL_DATA}"
88
+ exit 1
89
+ fi
90
+ fi
91
+
92
+ mkdir -p "${CKPT_DIR}"
93
+
94
+ echo "=================================================================="
95
+ echo " 3B SFT Fine-Tuning"
96
+ echo " Run name : ${RUN_NAME}"
97
+ echo " Config : ${CONFIG}"
98
+ echo " Base checkpoint : ${BASE_CHECKPOINT}"
99
+ echo " SFT data : ${SFT_DATA}"
100
+ echo " Val data : ${VAL_DATA}"
101
+ echo " CKPT dir : ${CKPT_DIR}"
102
+ echo " Log file : ${LOG_FILE}"
103
+ echo " Max steps : ${MAX_STEPS}"
104
+ echo " Batch size : ${BATCH_SIZE} (local) × ${NPROC} GPU × ${GRAD_ACCUM} grad_accum = $((BATCH_SIZE * NPROC * GRAD_ACCUM)) eff_batch"
105
+ echo " Learning rate : ${LR}"
106
+ echo " Warmup : ${WARMUP_STEPS} steps"
107
+ echo " Master port : ${MASTER_PORT}"
108
+ echo " ALLOC_CONF : ${PYTORCH_CUDA_ALLOC_CONF}"
109
+ echo " Started : $(date)"
110
+ echo "=================================================================="
111
+
112
+ export PYTHONWARNINGS="ignore::UserWarning:torch.library"
113
+
114
+ torchrun \
115
+ --nproc_per_node=${NPROC} \
116
+ --master_port=${MASTER_PORT} \
117
+ train/sft.py \
118
+ --config "${CONFIG}" \
119
+ --base_checkpoint "${BASE_CHECKPOINT}" \
120
+ --sft_data "${SFT_DATA}" \
121
+ --val_data "${VAL_DATA}" \
122
+ --checkpoint_dir "${CKPT_DIR}" \
123
+ --log_file "${LOG_FILE}" \
124
+ --max_steps ${MAX_STEPS} \
125
+ --batch_size ${BATCH_SIZE} \
126
+ --grad_accum ${GRAD_ACCUM} \
127
+ --lr ${LR} \
128
+ --warmup_steps ${WARMUP_STEPS} \
129
+ --seed ${SEED} \
130
+ --use_fp8 \
131
+ ${EXTRA_ARGS} \
132
+ 2>&1 | grep -v "UserWarning" \
133
+ | grep -v "Warning only once" \
134
+ | grep -v "Overriding a previously" \
135
+ | grep -v "dispatch key:" \
136
+ | grep -v "previous kernel:" \
137
+ | grep -v "new kernel:" \
138
+ | grep -v "operator: flash_attn" \
139
+ | grep -v "registered at /usr/local" \
140
+ | grep -v "self.m.impl" \
141
+ | tee -a "${LOG_FILE}"
142
+
143
+ echo "=================================================================="
144
+ echo " 3B SFT Done : $(date)"
145
+ echo "=================================================================="
source/scripts/launch_3b_sft_v2.sh ADDED
@@ -0,0 +1,156 @@
1
+ #!/usr/bin/env bash
2
+ # =============================================================================
3
+ # launch_3b_sft_v2.sh — 8-GPU FP8 SFT v2 launcher for 3B Korean LLM
4
+ #
5
+ # SFT v2 improvements over v1:
6
+ # - LR: 1e-5 → 5e-5 (5x, resolve underfitting)
7
+ # - Effective batch: 64 → 256 (4x)
8
+ # - Data mixing: 70% SFT + 30% pretrain (forgetting prevention)
9
+ # - Weight decay: 0.01 → 0.05
10
+ # - Warmup: 500 → 2000 steps
11
+ # - Max steps: 33000 → 15000
12
+ #
13
+ # Usage:
14
+ # bash scripts/launch_3b_sft_v2.sh
15
+ # bash scripts/launch_3b_sft_v2.sh --max_steps 200 # quick test
16
+ # bash scripts/launch_3b_sft_v2.sh --resume checkpoints/korean_3b_sft_v2/checkpoint-0002000
17
+ #
18
+ # Effective batch: 4 (local) x 8 GPU x 8 (grad_accum) = 256 samples/step
19
+ # =============================================================================
20
+ set -euo pipefail
21
+
22
+ # ---- Configurable defaults --------------------------------------------------
23
+ RUN_NAME="${RUN_NAME:-korean_3b_sft_v2}"
24
+ CONFIG="${CONFIG:-configs/korean_3b_sft_v2.yaml}"
25
+ BASE_CHECKPOINT="${BASE_CHECKPOINT:-checkpoints/korean_3b_fp8_run1/checkpoint-0057000}"
26
+ SFT_DATA="${SFT_DATA:-data/sft_combined/train_filtered.jsonl}"
27
+ VAL_DATA="${VAL_DATA:-data/sft_combined/val_filtered.jsonl}"
28
+ PRETRAIN_DATA="${PRETRAIN_DATA:-data/3b_train.bin}"
29
+ CKPT_DIR="checkpoints/${RUN_NAME}"
30
+ LOG_FILE="${CKPT_DIR}/train.log"
31
+ NPROC=8
32
+ MASTER_PORT="${MASTER_PORT:-29504}"
33
+
34
+ MAX_STEPS=15000
35
+ BATCH_SIZE=4
36
+ GRAD_ACCUM=8
37
+ LR="5.0e-5"
38
+ WARMUP_STEPS=2000
39
+ WEIGHT_DECAY=0.05
40
+ PRETRAIN_MIX_RATIO=0.3
41
+ SEED=42
42
+
43
+ EXTRA_ARGS="$@"
44
+
45
+ # ---- B200 / NVSwitch NCCL tuning (same as pretrain) -------------------------
46
+ export NCCL_IB_DISABLE=1
47
+ export NCCL_ALGO=Ring
48
+ export NCCL_PROTO=Simple
49
+ export NCCL_MIN_NCHANNELS=16
50
+ export NCCL_MAX_NCHANNELS=16
51
+ export NCCL_BUFFSIZE=67108864
52
+ export OMP_NUM_THREADS=4
53
+ export MKL_NUM_THREADS=4
54
+
55
+ # 3B + bs=4 VRAM allocation
56
+ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
57
+
58
+ cd "$(dirname "$0")/.."
59
+
60
+ # ---- Pre-flight checks ------------------------------------------------------
61
+ if [[ ! -d "${BASE_CHECKPOINT}" ]]; then
62
+ echo "=================================================================="
63
+ echo " ERROR: Base checkpoint not found: ${BASE_CHECKPOINT}"
64
+ echo " Set BASE_CHECKPOINT env var or use --base_checkpoint CLI arg."
65
+ echo "=================================================================="
66
+ exit 1
67
+ fi
68
+
69
+ if [[ ! -f "${SFT_DATA}" ]]; then
70
+ echo "=================================================================="
71
+ echo " ERROR: SFT data not found: ${SFT_DATA}"
72
+ echo " Run: bash scripts/prepare_sft_combined.sh"
73
+ echo "=================================================================="
74
+ exit 1
75
+ fi
76
+
77
+ if [[ ! -f "${PRETRAIN_DATA}" ]]; then
78
+ echo "=================================================================="
79
+ echo " ERROR: Pretrain data not found: ${PRETRAIN_DATA}"
80
+ echo " Set PRETRAIN_DATA env var to the correct path."
81
+ echo "=================================================================="
82
+ exit 1
83
+ fi
84
+
85
+ # val fallback
86
+ if [[ ! -f "${VAL_DATA}" ]]; then
87
+ VAL_FALLBACK="data/sft_combined/val.jsonl"
88
+ if [[ -f "${VAL_FALLBACK}" ]]; then
89
+ VAL_DATA="${VAL_FALLBACK}"
90
+ echo "[INFO] val_filtered not found, fallback: ${VAL_DATA}"
91
+ else
92
+ echo "ERROR: VAL_DATA not found: ${VAL_DATA}"
93
+ exit 1
94
+ fi
95
+ fi
96
+
97
+ mkdir -p "${CKPT_DIR}"
98
+
99
+ echo "=================================================================="
100
+ echo " 3B SFT v2 Fine-Tuning"
101
+ echo " Run name : ${RUN_NAME}"
102
+ echo " Config : ${CONFIG}"
103
+ echo " Base checkpoint : ${BASE_CHECKPOINT}"
104
+ echo " SFT data : ${SFT_DATA}"
105
+ echo " Pretrain data : ${PRETRAIN_DATA}"
106
+ echo " Val data : ${VAL_DATA}"
107
+ echo " CKPT dir : ${CKPT_DIR}"
108
+ echo " Log file : ${LOG_FILE}"
109
+ echo " Max steps : ${MAX_STEPS}"
110
+ echo " Batch size : ${BATCH_SIZE} (local) x ${NPROC} GPU x ${GRAD_ACCUM} grad_accum = $((BATCH_SIZE * NPROC * GRAD_ACCUM)) eff_batch"
111
+ echo " Learning rate : ${LR}"
112
+ echo " Weight decay : ${WEIGHT_DECAY}"
113
+ echo " Warmup : ${WARMUP_STEPS} steps"
114
+ echo " Data mixing : $((100 - ${PRETRAIN_MIX_RATIO%.*}0))% SFT + ${PRETRAIN_MIX_RATIO}00% pretrain"
115
+ echo " Master port : ${MASTER_PORT}"
116
+ echo " ALLOC_CONF : ${PYTORCH_CUDA_ALLOC_CONF}"
117
+ echo " Started : $(date)"
118
+ echo "=================================================================="
119
+
120
+ export PYTHONWARNINGS="ignore::UserWarning:torch.library"
121
+
122
+ torchrun \
123
+ --nproc_per_node=${NPROC} \
124
+ --master_port=${MASTER_PORT} \
125
+ train/sft.py \
126
+ --config "${CONFIG}" \
127
+ --base_checkpoint "${BASE_CHECKPOINT}" \
128
+ --sft_data "${SFT_DATA}" \
129
+ --val_data "${VAL_DATA}" \
130
+ --pretrain_data "${PRETRAIN_DATA}" \
131
+ --pretrain_mix_ratio ${PRETRAIN_MIX_RATIO} \
132
+ --checkpoint_dir "${CKPT_DIR}" \
133
+ --log_file "${LOG_FILE}" \
134
+ --max_steps ${MAX_STEPS} \
135
+ --batch_size ${BATCH_SIZE} \
136
+ --grad_accum ${GRAD_ACCUM} \
137
+ --lr ${LR} \
138
+ --weight_decay ${WEIGHT_DECAY} \
139
+ --warmup_steps ${WARMUP_STEPS} \
140
+ --seed ${SEED} \
141
+ --use_fp8 \
142
+ ${EXTRA_ARGS} \
143
+ 2>&1 | grep -v "UserWarning" \
144
+ | grep -v "Warning only once" \
145
+ | grep -v "Overriding a previously" \
146
+ | grep -v "dispatch key:" \
147
+ | grep -v "previous kernel:" \
148
+ | grep -v "new kernel:" \
149
+ | grep -v "operator: flash_attn" \
150
+ | grep -v "registered at /usr/local" \
151
+ | grep -v "self.m.impl" \
152
+ | tee -a "${LOG_FILE}"
153
+
154
+ echo "=================================================================="
155
+ echo " 3B SFT v2 Done : $(date)"
156
+ echo "=================================================================="
source/scripts/launch_fp8.sh ADDED
@@ -0,0 +1,94 @@
1
+ #!/usr/bin/env bash
2
+ # =============================================================================
3
+ # launch_fp8.sh — 8-GPU FP8 pretraining launcher for B200
4
+ #
5
+ # Usage:
6
+ # bash scripts/launch_fp8.sh # full run
7
+ # bash scripts/launch_fp8.sh --max_steps 500 # quick test
8
+ # bash scripts/launch_fp8.sh --resume checkpoints/small_fp8_run1/checkpoint-0001000
9
+ #
10
+ # Config is read from configs/small_fp8.yaml (model) + CLI args (train).
11
+ # Logs: checkpoints/<RUN_NAME>/train.log
12
+ # checkpoints/<RUN_NAME>/tensorboard/
13
+ # =============================================================================
14
+ set -euo pipefail
15
+
16
+ # ---- Configurable defaults --------------------------------------------------
17
+ RUN_NAME="${RUN_NAME:-small_fp8_run1}"
18
+ CONFIG="${CONFIG:-configs/small_fp8.yaml}"
19
+ TRAIN_DATA="${TRAIN_DATA:-data/train.bin}"
20
+ VAL_DATA="${VAL_DATA:-data/val.bin}"
21
+ CKPT_DIR="checkpoints/${RUN_NAME}"
22
+ LOG_FILE="${CKPT_DIR}/train.log"
23
+ NPROC=8
24
+ MASTER_PORT="${MASTER_PORT:-29500}"
25
+
26
+ # ---- Defaults that can be overridden via extra CLI args --------------------
27
+ MAX_STEPS=100000
28
+ BATCH_SIZE=8
29
+ GRAD_ACCUM=4
30
+ WARMUP_STEPS=2000
31
+ SEED=42
32
+
33
+ # ---- Pass remaining CLI args directly to pretrain.py ----------------------
34
+ EXTRA_ARGS="$@"
35
+
36
+ # ---- B200 / NVSwitch single-node NCCL tuning --------------------------------
37
+ # Single-node NVSwitch (NV18 full-mesh): disable IB to prevent NCCL probing.
38
+ export NCCL_IB_DISABLE=1
39
+ # Use Ring algorithm for large gradient tensors (128M-70B model range).
40
+ export NCCL_ALGO=Ring
41
+ # Simple protocol is optimal for NVLink bulk transfers (vs LL/LL128 for IB).
42
+ export NCCL_PROTO=Simple
43
+ # More channels → better NVSwitch saturation for large all-reduce payloads.
44
+ export NCCL_MIN_NCHANNELS=16
45
+ export NCCL_MAX_NCHANNELS=16
46
+ # Larger NCCL buffer (64 MB) reduces ring synchronisation overhead.
47
+ export NCCL_BUFFSIZE=67108864
48
+ # CPU thread limits (72 cores ÷ 8 ranks = 9; use 4 for DataLoader headroom).
49
+ export OMP_NUM_THREADS=4
50
+ export MKL_NUM_THREADS=4
51
+
52
+ # ---- Setup ------------------------------------------------------------------
53
+ mkdir -p "${CKPT_DIR}"
54
+ cd "$(dirname "$0")/.." # always run from project root
55
+
56
+ echo "=================================================================="
57
+ echo " Run name : ${RUN_NAME}"
58
+ echo " Config : ${CONFIG}"
59
+ echo " CKPT dir : ${CKPT_DIR}"
60
+ echo " Log file : ${LOG_FILE}"
61
+ echo " Started : $(date)"
62
+ echo "=================================================================="
63
+
64
+ # Suppress the harmless flash_attn kernel override warning from all ranks.
65
+ export PYTHONWARNINGS="ignore::UserWarning:torch.library"
66
+
67
+ torchrun \
68
+ --nproc_per_node=${NPROC} \
69
+ --master_port=${MASTER_PORT} \
70
+ train/pretrain.py \
71
+ --config "${CONFIG}" \
72
+ --train_data "${TRAIN_DATA}" \
73
+ --val_data "${VAL_DATA}" \
74
+ --checkpoint_dir "${CKPT_DIR}" \
75
+ --log_file "${LOG_FILE}" \
76
+ --max_steps ${MAX_STEPS} \
77
+ --batch_size ${BATCH_SIZE} \
78
+ --grad_accum ${GRAD_ACCUM} \
79
+ --warmup_steps ${WARMUP_STEPS} \
80
+ --seed ${SEED} \
81
+ ${EXTRA_ARGS} \
82
+ 2>&1 | grep -v "UserWarning" \
83
+ | grep -v "Warning only once" \
84
+ | grep -v "Overriding a previously" \
85
+ | grep -v "dispatch key:" \
86
+ | grep -v "previous kernel:" \
87
+ | grep -v "new kernel:" \
88
+ | grep -v "operator: flash_attn" \
89
+ | grep -v "registered at /usr/local" \
90
+ | grep -v "self.m.impl"
91
+
92
+ echo "=================================================================="
93
+ echo " Done : $(date)"
94
+ echo "=================================================================="
source/scripts/launch_hybrid_3b.sh ADDED
@@ -0,0 +1,62 @@
1
+ #!/bin/bash
2
+ # ============================================================================
3
+ # FRANKENSTALLM-H 3B: Hybrid Mamba-2 + Transformer 학습 런치 스크립트
4
+ # ============================================================================
5
+ #
6
+ # 사용법:
7
+ # nohup setsid bash scripts/launch_hybrid_3b.sh > logs/hybrid_3b.log 2>&1 &
8
+ #
9
+ # SIGHUP 방어: nohup + setsid 조합으로 SSH 끊김에도 학습 유지
10
+ # ============================================================================
11
+
12
+ set -euo pipefail
13
+
14
+ # ---- 환경 변수 ----
15
+ export OMP_NUM_THREADS=4
16
+ export NCCL_ALGO=NVLS # NVSwitch 최적 알고리즘
17
+ export NCCL_IB_DISABLE=1 # InfiniBand 비활성 (단일 노드)
18
+ export NCCL_P2P_LEVEL=NVL # NVLink P2P
19
+ export NCCL_NET_GDR_LEVEL=0 # GPU Direct RDMA 비활성 (단일 노드)
20
+
21
+ # ---- 경로 ----
22
+ PROJECT_ROOT="/PROJECT/0325120031_A/ghong/taketimes/llm-bang"
23
+ CONFIG="${PROJECT_ROOT}/configs/hybrid_3b.yaml"
24
+ TRAIN_DATA="${PROJECT_ROOT}/data/3b_train.bin"
25
+ VAL_DATA="${PROJECT_ROOT}/data/3b_val.bin"
26
+ CKPT_DIR="${PROJECT_ROOT}/checkpoints/hybrid_3b_run1"
27
+ LOG_FILE="${PROJECT_ROOT}/logs/hybrid_3b_train.log"
28
+
29
+ # ---- 디렉토리 생성 ----
30
+ mkdir -p "${CKPT_DIR}"
31
+ mkdir -p "$(dirname ${LOG_FILE})"
32
+
33
+ cd "${PROJECT_ROOT}"
34
+
35
+ echo "============================================"
36
+ echo " FRANKENSTALLM-H 3B Hybrid Training"
37
+ echo " Config: ${CONFIG}"
38
+ echo " Data: ${TRAIN_DATA}"
39
+ echo " Checkpoint: ${CKPT_DIR}"
40
+ echo " Started: $(date '+%Y-%m-%d %H:%M:%S')"
41
+ echo "============================================"
42
+
43
+ # ---- 학습 실행 (8 GPU DDP) ----
44
+ torchrun \
45
+ --nproc_per_node=8 \
46
+ --master_port=29500 \
47
+ train/pretrain.py \
48
+ --config "${CONFIG}" \
49
+ --train_data "${TRAIN_DATA}" \
50
+ --val_data "${VAL_DATA}" \
51
+ --checkpoint_dir "${CKPT_DIR}" \
52
+ --batch_size 4 \
53
+ --lr 2e-4 \
54
+ --weight_decay 0.1 \
55
+ --warmup_steps 2000 \
56
+ --grad_accum 8 \
57
+ --max_steps 57000 \
58
+ --log_file "${LOG_FILE}" \
59
+ --use_fp8 \
60
+ "$@"
61
+
62
+ echo "Training finished at $(date '+%Y-%m-%d %H:%M:%S')"
source/scripts/launch_korean_1b.sh ADDED
@@ -0,0 +1,133 @@
1
+ #!/usr/bin/env bash
2
+ # =============================================================================
3
+ # launch_korean_1b.sh — 8-GPU FP8 pretraining launcher for 1B Korean LLM
4
+ #
5
+ # Usage:
6
+ # bash scripts/launch_korean_1b.sh # full run
7
+ # bash scripts/launch_korean_1b.sh --max_steps 500 # quick test
8
+ # bash scripts/launch_korean_1b.sh --resume checkpoints/korean_1b_fp8_run1/checkpoint-0010000
9
+ #
10
+ # Config is read from configs/korean_1b_fp8.yaml (model) + CLI args (train).
11
+ # Effective batch size: 8 (local) × 8 GPU × 4 (grad_accum) × 4096 (seq_len)
12
+ # = 1,048,576 tokens / step
13
+ # Logs: checkpoints/<RUN_NAME>/train.log
14
+ # checkpoints/<RUN_NAME>/tensorboard/
15
+ # =============================================================================
16
+ set -euo pipefail
17
+
18
+ # ---- Configurable defaults --------------------------------------------------
19
+ RUN_NAME="${RUN_NAME:-korean_1b_fp8_run1}"
20
+ CONFIG="${CONFIG:-configs/korean_1b_fp8.yaml}"
21
+ TRAIN_DATA="${TRAIN_DATA:-data/korean_train.bin}"
22
+ VAL_DATA="${VAL_DATA:-data/korean_val.bin}"
23
+ CKPT_DIR="checkpoints/${RUN_NAME}"
24
+ LOG_FILE="${CKPT_DIR}/train.log"
25
+ NPROC=8
26
+ MASTER_PORT="${MASTER_PORT:-29501}"
27
+
28
+ # ---- Defaults that can be overridden via extra CLI args --------------------
29
+ MAX_STEPS=34000 # 4 epochs × 8.91B tokens = 35.6B (Muennighoff 2023: val loss rises beyond 4 epochs)
30
+ BATCH_SIZE=8
31
+ GRAD_ACCUM=4
32
+ WARMUP_STEPS=2000 # 5.9% of 34k steps (the previous 4000 = 11.8% was excessive)
33
+ SEED=42
34
+
35
+ # ---- Pass remaining CLI args directly to pretrain.py ----------------------
36
+ EXTRA_ARGS="$@"
37
+
38
+ # ---- B200 / NVSwitch single-node NCCL tuning --------------------------------
39
+ # Single-node NVSwitch (NV18 full-mesh): disable IB to prevent NCCL probing.
40
+ export NCCL_IB_DISABLE=1
41
+ # Use Ring algorithm for large gradient tensors (128M-70B model range).
42
+ export NCCL_ALGO=Ring
43
+ # Simple protocol is optimal for NVLink bulk transfers (vs LL/LL128 for IB).
44
+ export NCCL_PROTO=Simple
45
+ # More channels → better NVSwitch saturation for large all-reduce payloads.
46
+ export NCCL_MIN_NCHANNELS=16
47
+ export NCCL_MAX_NCHANNELS=16
48
+ # Larger NCCL buffer (64 MB) reduces ring synchronisation overhead.
49
+ export NCCL_BUFFSIZE=67108864
50
+ # CPU thread limits (72 cores ÷ 8 ranks = 9; use 4 for DataLoader headroom).
51
+ export OMP_NUM_THREADS=4
52
+ export MKL_NUM_THREADS=4
53
+
54
+ # ---- Setup ------------------------------------------------------------------
55
+ cd "$(dirname "$0")/.." # always run from project root
56
+
57
+ # ---- Pre-flight check: Korean data must exist before launching --------------
58
+ if [[ ! -f "${TRAIN_DATA}" ]]; then
59
+ echo "=================================================================="
60
+ echo " ERROR: Training data not found: ${TRAIN_DATA}"
61
+ echo ""
62
+ echo " You need to run the Korean data pipeline first."
63
+ echo " Example steps:"
64
+ echo " 1. Download / prepare raw Korean corpus"
65
+ echo " 2. Tokenise and pack into binary format:"
66
+ echo " python data/prepare_korean.py --output data/korean_train.bin"
67
+ echo " 3. Re-run this script once the file exists."
68
+ echo "=================================================================="
69
+ exit 1
70
+ fi
71
+
72
+ if [[ ! -f "${VAL_DATA}" ]]; then
73
+ echo "=================================================================="
74
+ echo " ERROR: Validation data not found: ${VAL_DATA}"
75
+ echo ""
76
+ echo " You need to run the Korean data pipeline first."
77
+ echo " Example steps:"
78
+ echo " 1. Download / prepare raw Korean corpus"
79
+ echo " 2. Tokenise and pack into binary format (val split):"
80
+ echo " python data/prepare_korean.py --output_val data/korean_val.bin"
81
+ echo " 3. Re-run this script once the file exists."
82
+ echo "=================================================================="
83
+ exit 1
84
+ fi
85
+
86
+ mkdir -p "${CKPT_DIR}"
87
+
88
+ echo "=================================================================="
89
+ echo " Run name : ${RUN_NAME}"
90
+ echo " Config : ${CONFIG}"
91
+ echo " Train data : ${TRAIN_DATA}"
92
+ echo " Val data : ${VAL_DATA}"
93
+ echo " CKPT dir : ${CKPT_DIR}"
94
+ echo " Log file : ${LOG_FILE}"
95
+ echo " Max steps : ${MAX_STEPS}"
96
+ echo " Batch size : ${BATCH_SIZE} (local) × ${NPROC} GPU × ${GRAD_ACCUM} grad_accum"
97
+ echo " Warmup : ${WARMUP_STEPS} steps"
98
+ echo " Master port : ${MASTER_PORT}"
99
+ echo " Started : $(date)"
100
+ echo "=================================================================="
101
+
102
+ # Suppress the harmless flash_attn kernel override warning from all ranks.
103
+ export PYTHONWARNINGS="ignore::UserWarning:torch.library"
104
+
105
+ torchrun \
106
+ --nproc_per_node=${NPROC} \
107
+ --master_port=${MASTER_PORT} \
108
+ train/pretrain.py \
109
+ --config "${CONFIG}" \
110
+ --train_data "${TRAIN_DATA}" \
111
+ --val_data "${VAL_DATA}" \
112
+ --checkpoint_dir "${CKPT_DIR}" \
113
+ --log_file "${LOG_FILE}" \
114
+ --max_steps ${MAX_STEPS} \
115
+ --batch_size ${BATCH_SIZE} \
116
+ --grad_accum ${GRAD_ACCUM} \
117
+ --warmup_steps ${WARMUP_STEPS} \
118
+ --seed ${SEED} \
119
+ ${EXTRA_ARGS} \
120
+ 2>&1 | grep -v "UserWarning" \
121
+ | grep -v "Warning only once" \
122
+ | grep -v "Overriding a previously" \
123
+ | grep -v "dispatch key:" \
124
+ | grep -v "previous kernel:" \
125
+ | grep -v "new kernel:" \
126
+ | grep -v "operator: flash_attn" \
127
+ | grep -v "registered at /usr/local" \
128
+ | grep -v "self.m.impl" \
129
+ | tee -a "${LOG_FILE}"
130
+
131
+ echo "=================================================================="
132
+ echo " Done : $(date)"
133
+ echo "=================================================================="
source/scripts/launch_korean_3b.sh ADDED
@@ -0,0 +1,115 @@
 
 
1
+ #!/usr/bin/env bash
2
+ # =============================================================================
3
+ # launch_korean_3b.sh — 8-GPU FP8 pretraining launcher for 3B Korean LLM
4
+ #
5
+ # Usage:
6
+ # bash scripts/launch_korean_3b.sh # full run (~60B tokens)
7
+ # bash scripts/launch_korean_3b.sh --max_steps 50 # quick benchmark
8
+ # bash scripts/launch_korean_3b.sh --resume checkpoints/korean_3b_fp8_run1/checkpoint-XXXXX
9
+ #
10
+ # Effective batch size: 8 (local) × 8 GPU × 4 (grad_accum) × 4096 (seq_len)
11
+ # = 1,048,576 tokens / step
12
+ # =============================================================================
13
+ set -euo pipefail
14
+
15
+ RUN_NAME="${RUN_NAME:-korean_3b_fp8_run1}"
16
+ CONFIG="${CONFIG:-configs/3b_pretrain.yaml}"
17
+ CKPT_DIR="checkpoints/${RUN_NAME}"
18
+ LOG_FILE="${CKPT_DIR}/train.log"
19
+ NPROC=8
20
+ MASTER_PORT="${MASTER_PORT:-29502}"
21
+
22
+ MAX_STEPS=57000
23
+ BATCH_SIZE=4
24
+ GRAD_ACCUM=8
25
+ LR=1.5e-4
26
+ WARMUP_STEPS=2000
27
+ SEED=42
28
+
29
+ EXTRA_ARGS="$@"
30
+
31
+ # ---- B200 / NVSwitch NCCL tuning -------------------------------------------
32
+ export NCCL_IB_DISABLE=1
33
+ export NCCL_ALGO=Ring
34
+ export NCCL_PROTO=Simple
35
+ export NCCL_MIN_NCHANNELS=16
36
+ export NCCL_MAX_NCHANNELS=16
37
+ export NCCL_BUFFSIZE=67108864
38
+ export OMP_NUM_THREADS=4
39
+ export MKL_NUM_THREADS=4
40
+
41
+ # cd FIRST so the relative-path checks below resolve against the project root
42
+ cd "$(dirname "$0")/.."
43
+
44
+ # TRAIN_DATA fallback: relative-path check runs after the cd above
45
+ if [[ -f "data/merged_3b_train.bin" ]]; then
46
+ TRAIN_DATA="${TRAIN_DATA:-data/merged_3b_train.bin}"
47
+ echo "Using merged training data: data/merged_3b_train.bin"
48
+ elif [[ -f "data/korean_train.bin" ]]; then
49
+ TRAIN_DATA="${TRAIN_DATA:-data/korean_train.bin}"
50
+ echo "Using fallback training data: data/korean_train.bin"
51
+ else
52
+ echo "ERROR: No training data found (data/merged_3b_train.bin or data/korean_train.bin)"
53
+ exit 1
54
+ fi
55
+
56
+ # VAL_DATA fallback: relative-path check runs after the cd above
57
+ VAL_DATA="${VAL_DATA:-data/merged_3b_val.bin}"
58
+ if [[ ! -f "${VAL_DATA}" ]]; then
59
+ VAL_DATA="data/korean_val.bin"
60
+ fi
61
+
62
+ if [[ ! -f "${TRAIN_DATA}" ]]; then
63
+ echo "ERROR: Training data not found: ${TRAIN_DATA}"
64
+ exit 1
65
+ fi
66
+ if [[ ! -f "${VAL_DATA}" ]]; then
67
+ echo "ERROR: Validation data not found: ${VAL_DATA}"
68
+ exit 1
69
+ fi
70
+
71
+ mkdir -p "${CKPT_DIR}"
72
+
73
+ echo "=================================================================="
74
+ echo " Run name : ${RUN_NAME}"
75
+ echo " Config : ${CONFIG}"
76
+ echo " Train data : ${TRAIN_DATA}"
77
+ echo " CKPT dir : ${CKPT_DIR}"
78
+ echo " Max steps : ${MAX_STEPS}"
79
+ echo " LR : ${LR}"
80
+ echo " Batch size : ${BATCH_SIZE} (local) × ${NPROC} GPU × ${GRAD_ACCUM} grad_accum"
81
+ echo " Started : $(date)"
82
+ echo "=================================================================="
83
+
84
+ export PYTHONWARNINGS="ignore::UserWarning:torch.library"
85
+
86
+ torchrun \
87
+ --nproc_per_node=${NPROC} \
88
+ --master_port=${MASTER_PORT} \
89
+ train/pretrain.py \
90
+ --config "${CONFIG}" \
91
+ --train_data "${TRAIN_DATA}" \
92
+ --val_data "${VAL_DATA}" \
93
+ --checkpoint_dir "${CKPT_DIR}" \
94
+ --log_file "${LOG_FILE}" \
95
+ --max_steps ${MAX_STEPS} \
96
+ --batch_size ${BATCH_SIZE} \
97
+ --lr ${LR} \
98
+ --grad_accum ${GRAD_ACCUM} \
99
+ --warmup_steps ${WARMUP_STEPS} \
100
+ --seed ${SEED} \
101
+ ${EXTRA_ARGS} \
102
+ 2>&1 | grep -v "UserWarning" \
103
+ | grep -v "Warning only once" \
104
+ | grep -v "Overriding a previously" \
105
+ | grep -v "dispatch key:" \
106
+ | grep -v "previous kernel:" \
107
+ | grep -v "new kernel:" \
108
+ | grep -v "operator: flash_attn" \
109
+ | grep -v "registered at /usr/local" \
110
+ | grep -v "self.m.impl" \
111
+ | tee -a "${LOG_FILE}"
112
+
113
+ echo "=================================================================="
114
+ echo " Done : $(date)"
115
+ echo "=================================================================="
source/scripts/launch_sft.sh ADDED
@@ -0,0 +1,111 @@
 
 
1
+ #!/usr/bin/env bash
2
+ # =============================================================================
3
+ # launch_sft.sh — 8-GPU FP8 SFT launcher for 1B Korean LLM
4
+ #
5
+ # Usage:
6
+ # bash scripts/launch_sft.sh
7
+ # bash scripts/launch_sft.sh --max_steps 500 # quick test
8
+ # bash scripts/launch_sft.sh --resume checkpoints/korean_1b_sft/checkpoint-0001000
9
+ #
10
+ # Base model: checkpoints/korean_1b_fp8_run1/checkpoint-0034000
11
+ # SFT data: data/sft/train.jsonl
12
+ # =============================================================================
13
+ set -euo pipefail
14
+
15
+ # ---- Configurable defaults --------------------------------------------------
16
+ RUN_NAME="${RUN_NAME:-korean_1b_sft}"
17
+ BASE_CHECKPOINT="${BASE_CHECKPOINT:-checkpoints/korean_1b_fp8_run1/checkpoint-0034000}"
18
+ SFT_DATA="${SFT_DATA:-data/sft/train.jsonl}"
19
+ VAL_DATA="${VAL_DATA:-data/sft/val.jsonl}"
20
+ CKPT_DIR="checkpoints/${RUN_NAME}"
21
+ LOG_FILE="${CKPT_DIR}/train.log"
22
+ NPROC=8
23
+ MASTER_PORT="${MASTER_PORT:-29502}"
24
+
25
+ MAX_STEPS=9000
26
+ BATCH_SIZE=4
27
+ GRAD_ACCUM=2
28
+ LR="2.0e-5"
29
+ WARMUP_STEPS=300
30
+ SEED=42
31
+
32
+ EXTRA_ARGS="$@"
33
+
34
+ # ---- B200 / NVSwitch NCCL tuning (same as pretrain) -------------------------
35
+ export NCCL_IB_DISABLE=1
36
+ export NCCL_ALGO=Ring
37
+ export NCCL_PROTO=Simple
38
+ export NCCL_MIN_NCHANNELS=16
39
+ export NCCL_MAX_NCHANNELS=16
40
+ export NCCL_BUFFSIZE=67108864
41
+ export OMP_NUM_THREADS=4
42
+ export MKL_NUM_THREADS=4
43
+
44
+ cd "$(dirname "$0")/.."
45
+
46
+ # ---- Pre-flight checks ------------------------------------------------------
47
+ if [[ ! -d "${BASE_CHECKPOINT}" ]]; then
48
+ echo "ERROR: Base checkpoint not found: ${BASE_CHECKPOINT}"
49
+ exit 1
50
+ fi
51
+
52
+ if [[ ! -f "${SFT_DATA}" ]]; then
53
+ echo "=================================================================="
54
+ echo " ERROR: SFT training data not found: ${SFT_DATA}"
55
+ echo ""
56
+ echo " Run the data preparation script first:"
57
+ echo " python data/prepare_sft_data.py"
58
+ echo "=================================================================="
59
+ exit 1
60
+ fi
61
+
62
+ mkdir -p "${CKPT_DIR}"
63
+
64
+ echo "=================================================================="
65
+ echo " SFT Fine-Tuning"
66
+ echo " Run name : ${RUN_NAME}"
67
+ echo " Base checkpoint : ${BASE_CHECKPOINT}"
68
+ echo " SFT data : ${SFT_DATA}"
69
+ echo " CKPT dir : ${CKPT_DIR}"
70
+ echo " Log file : ${LOG_FILE}"
71
+ echo " Max steps : ${MAX_STEPS}"
72
+ echo " Batch size : ${BATCH_SIZE} (local) × ${NPROC} GPU × ${GRAD_ACCUM} grad_accum"
73
+ echo " Learning rate : ${LR}"
74
+ echo " Warmup : ${WARMUP_STEPS} steps"
75
+ echo " Master port : ${MASTER_PORT}"
76
+ echo " Started : $(date)"
77
+ echo "=================================================================="
78
+
79
+ export PYTHONWARNINGS="ignore::UserWarning:torch.library"
80
+
81
+ torchrun \
82
+ --nproc_per_node=${NPROC} \
83
+ --master_port=${MASTER_PORT} \
84
+ train/sft.py \
85
+ --base_checkpoint "${BASE_CHECKPOINT}" \
86
+ --sft_data "${SFT_DATA}" \
87
+ --checkpoint_dir "${CKPT_DIR}" \
88
+ --log_file "${LOG_FILE}" \
89
+ --max_steps ${MAX_STEPS} \
90
+ --batch_size ${BATCH_SIZE} \
91
+ --grad_accum ${GRAD_ACCUM} \
92
+ --lr ${LR} \
93
+ --warmup_steps ${WARMUP_STEPS} \
94
+ --seed ${SEED} \
95
+ --use_fp8 \
96
+ --val_data "${VAL_DATA}" \
97
+ ${EXTRA_ARGS} \
98
+ 2>&1 | grep -v "UserWarning" \
99
+ | grep -v "Warning only once" \
100
+ | grep -v "Overriding a previously" \
101
+ | grep -v "dispatch key:" \
102
+ | grep -v "previous kernel:" \
103
+ | grep -v "new kernel:" \
104
+ | grep -v "operator: flash_attn" \
105
+ | grep -v "registered at /usr/local" \
106
+ | grep -v "self.m.impl" \
107
+ | tee -a "${LOG_FILE}"
108
+
109
+ echo "=================================================================="
110
+ echo " SFT Done : $(date)"
111
+ echo "=================================================================="
source/scripts/migrate_qkv_checkpoint.py ADDED
@@ -0,0 +1,230 @@
 
 
1
+ #!/usr/bin/env python3
2
+ """Migrate checkpoint from separate Q/K/V projections to fused QKV.
3
+
4
+ Usage:
5
+ python3 scripts/migrate_qkv_checkpoint.py <checkpoint_dir>
6
+
7
+ Migrates both model.pt AND optimizer.pt:
8
+ - model.pt: q_proj/k_proj/v_proj weights → qkv_proj weight
9
+ - optimizer.pt: exp_avg/exp_avg_sq states fused, param indices re-mapped
10
+
11
+ The concatenation order is [Q ; K ; V] along the output (dim-0) axis,
12
+ which matches the split in MultiHeadAttention.forward:
13
+ q, k, v = qkv.split([_q_dim, _kv_dim, _kv_dim], dim=-1)
14
+
15
+ Optimizer layout (group 0 = weight_decay, per layer × 28):
16
+ [i*6+0] q_proj.weight [3072, 3072]
17
+ [i*6+1] k_proj.weight [1024, 3072]
18
+ [i*6+2] v_proj.weight [1024, 3072]
19
+ [i*6+3] out_proj.weight [3072, 3072]
20
+ [i*6+4] fc1_weight [16384, 3072]
21
+ [i*6+5] fc2_weight [3072, 8192]
22
+ After fusion: indices 0,1,2 → single qkv_proj → 4 params per layer.
23
+ """
24
+ import sys
25
+ import torch
26
+ from pathlib import Path
27
+
28
+ N_LAYERS = 28
29
+ OLD_PARAMS_PER_LAYER = 6 # q, k, v, out, fc1, fc2
30
+ NEW_PARAMS_PER_LAYER = 4 # qkv, out, fc1, fc2
31
+
32
+
33
+ def migrate_model(state: dict) -> dict:
34
+ """Fuse Q/K/V projection weights into QKV in model state dict."""
35
+ new_state: dict = {}
36
+ layers_done: set = set()
37
+
38
+ for key, val in state.items():
39
+ if ".q_proj." not in key and ".k_proj." not in key and ".v_proj." not in key:
40
+ new_state[key] = val
41
+ continue
42
+
43
+ if ".q_proj." not in key:
44
+ continue
45
+
46
+ prefix = key.rsplit(".", 2)[0]
47
+ suffix = key.rsplit(".", 1)[-1]
48
+
49
+ tag = (prefix, suffix)
50
+ if tag in layers_done:
51
+ continue
52
+ layers_done.add(tag)
53
+
54
+ q_key = f"{prefix}.q_proj.{suffix}"
55
+ k_key = f"{prefix}.k_proj.{suffix}"
56
+ v_key = f"{prefix}.v_proj.{suffix}"
57
+
58
+ missing = [k for k in (q_key, k_key, v_key) if k not in state]
59
+ if missing:
60
+ raise KeyError(f"Expected keys not found in checkpoint: {missing}")
61
+
62
+ q_w, k_w, v_w = state[q_key], state[k_key], state[v_key]
63
+ fused = torch.cat([q_w, k_w, v_w], dim=0)
64
+ fused_key = f"{prefix}.qkv_proj.{suffix}"
65
+ new_state[fused_key] = fused
66
+ print(f" Fused {fused_key}: {list(fused.shape)}"
67
+ f" (q={list(q_w.shape)}, k={list(k_w.shape)}, v={list(v_w.shape)})")
68
+
69
+ leaked = [k for k in new_state if ".q_proj." in k or ".k_proj." in k or ".v_proj." in k]
70
+ if leaked:
71
+ raise RuntimeError(f"BUG: old projection keys still present: {leaked}")
72
+
73
+ return new_state
74
+
75
+
76
+ def migrate_optimizer(opt_state: dict) -> dict:
77
+ """Fuse optimizer states for Q/K/V → QKV and re-index parameters.
78
+
79
+ The optimizer has 2 param groups:
80
+ Group 0 (weight_decay): 168 = 28 layers × 6 (q,k,v,out,fc1,fc2)
81
+ Group 1 (no weight_decay): 58 = norms + embedding
82
+
83
+ We fuse q,k,v entries in group 0 (indices i*6+0,1,2 → one entry per layer).
84
+ Group 0 shrinks from 168 to 112 (28 layers × 4 params).
85
+ Group 1 stays at 58. Total: 170.
86
+ """
87
+ old_state = opt_state["state"]
88
+ old_groups = opt_state["param_groups"]
89
+
90
+ group0_count = len(old_groups[0]["params"])
91
+ expected_g0 = N_LAYERS * OLD_PARAMS_PER_LAYER
92
+ if group0_count != expected_g0:
93
+ raise ValueError(
94
+ f"Group 0 has {group0_count} params, expected {expected_g0}. "
95
+ f"Cannot auto-detect QKV layout."
96
+ )
97
+
98
+ # Validate shapes for first layer
99
+ shapes = []
100
+ for j in range(OLD_PARAMS_PER_LAYER):
101
+ idx = old_groups[0]["params"][j]
102
+ shapes.append(list(old_state[idx]["exp_avg"].shape))
103
+ expected_shapes = [[3072, 3072], [1024, 3072], [1024, 3072],
104
+ [3072, 3072], [16384, 3072], [3072, 8192]]
105
+ if shapes != expected_shapes:
106
+ raise ValueError(
107
+ f"Layer 0 shapes {shapes} don't match expected {expected_shapes}. "
108
+ f"Cannot auto-detect QKV layout."
109
+ )
110
+ print(f" Shape validation passed for layer 0.")
111
+
112
+ new_state_entries = {}
113
+ new_idx = 0
114
+
115
+ # --- Group 0: fuse q/k/v per layer ---
116
+ for layer_i in range(N_LAYERS):
117
+ base = layer_i * OLD_PARAMS_PER_LAYER
118
+ q_opt_idx = old_groups[0]["params"][base + 0]
119
+ k_opt_idx = old_groups[0]["params"][base + 1]
120
+ v_opt_idx = old_groups[0]["params"][base + 2]
121
+
122
+ q_entry = old_state[q_opt_idx]
123
+ k_entry = old_state[k_opt_idx]
124
+ v_entry = old_state[v_opt_idx]
125
+
126
+ # Fuse QKV
127
+ fused_entry = {"step": q_entry["step"]}
128
+ for field in ["exp_avg", "exp_avg_sq"]:
129
+ if field in q_entry:
130
+ fused_entry[field] = torch.cat(
131
+ [q_entry[field], k_entry[field], v_entry[field]], dim=0
132
+ )
133
+ new_state_entries[new_idx] = fused_entry
134
+ if layer_i == 0:
135
+ print(f" Layer 0 QKV fused: exp_avg {list(fused_entry['exp_avg'].shape)}")
136
+ new_idx += 1
137
+
138
+ # Copy remaining params (out, fc1, fc2)
139
+ for offset in [3, 4, 5]:
140
+ opt_idx = old_groups[0]["params"][base + offset]
141
+ new_state_entries[new_idx] = old_state[opt_idx]
142
+ new_idx += 1
143
+
144
+ new_group0_count = new_idx # should be N_LAYERS * NEW_PARAMS_PER_LAYER = 112
145
+ print(f" Group 0: {group0_count} → {new_group0_count} params")
146
+
147
+ # --- Group 1: copy as-is (norms, embedding — no QKV) ---
148
+ group1_count = len(old_groups[1]["params"])
149
+ for j in range(group1_count):
150
+ opt_idx = old_groups[1]["params"][j]
151
+ if opt_idx in old_state:
152
+ new_state_entries[new_idx] = old_state[opt_idx]
153
+ new_idx += 1
154
+ print(f" Group 1: {group1_count} → {group1_count} params (unchanged)")
155
+
156
+ # Build new param_groups
157
+ new_groups = []
158
+ g0 = {k: v for k, v in old_groups[0].items() if k != "params"}
159
+ g0["params"] = list(range(0, new_group0_count))
160
+ new_groups.append(g0)
161
+
162
+ g1 = {k: v for k, v in old_groups[1].items() if k != "params"}
163
+ g1["params"] = list(range(new_group0_count, new_group0_count + group1_count))
164
+ new_groups.append(g1)
165
+
166
+ total = new_group0_count + group1_count
167
+ print(f" Total: {len(old_state)} → {total} optimizer params")
168
+
169
+ return {"state": new_state_entries, "param_groups": new_groups}
170
+
171
+
172
+ def migrate(ckpt_dir: Path) -> None:
173
+ model_path = ckpt_dir / "model.pt"
174
+ opt_path = ckpt_dir / "optimizer.pt"
175
+
176
+ if not model_path.exists():
177
+ raise FileNotFoundError(f"model.pt not found in {ckpt_dir}")
178
+
179
+ # --- Model migration ---
180
+ print(f"[1/2] Migrating model weights from {model_path} ...")
181
+ state = torch.load(model_path, map_location="cpu", weights_only=True)
182
+
183
+ has_old = any(".q_proj." in k for k in state)
184
+ has_new = any(".qkv_proj." in k for k in state)
185
+
186
+ if has_new and not has_old:
187
+ print(" Model already migrated. Skipping.")
188
+ elif has_old:
189
+ new_model_state = migrate_model(state)
190
+ torch.save(new_model_state, model_path)
191
+ print(f" Model saved.")
192
+ else:
193
+ raise RuntimeError("Model state has neither q_proj nor qkv_proj keys!")
194
+
195
+ # --- Optimizer migration ---
196
+ if opt_path.exists():
197
+ print(f"\n[2/2] Migrating optimizer states from {opt_path} ...")
198
+ opt = torch.load(opt_path, map_location="cpu", weights_only=True)
199
+
200
+ # Check if already migrated
201
+ total_params = sum(len(pg["params"]) for pg in opt["param_groups"])
202
+ expected_old = N_LAYERS * OLD_PARAMS_PER_LAYER + 58 # 168 + 58 = 226
203
+ expected_new = N_LAYERS * NEW_PARAMS_PER_LAYER + 58 # 112 + 58 = 170
204
+
205
+ if total_params == expected_old:
206
+ opt_backup = ckpt_dir / "optimizer.pt.backup_pre_qkv"
207
+ if not opt_backup.exists():
208
+ torch.save(opt, opt_backup)
209
+ print(f" Backup: {opt_backup}")
210
+ new_opt = migrate_optimizer(opt)
211
+ torch.save(new_opt, opt_path)
212
+ print(f" Optimizer saved.")
213
+ elif total_params == expected_new:
214
+ print(f" Optimizer already migrated ({total_params} params). Skipping.")
215
+ else:
216
+ print(f" [WARN] Unexpected param count {total_params} "
217
+ f"(expected old={expected_old} or new={expected_new}). "
218
+ f"Deleting optimizer.pt — optimizer will restart fresh.")
219
+ opt_path.unlink()
220
+ else:
221
+ print("\n[2/2] No optimizer.pt found. Optimizer will restart fresh.")
222
+
223
+ print("\nMigration complete!")
224
+
225
+
226
+ if __name__ == "__main__":
227
+ if len(sys.argv) != 2:
228
+ print(__doc__)
229
+ sys.exit(1)
230
+ migrate(Path(sys.argv[1]))
source/scripts/monitor_3b.sh ADDED
@@ -0,0 +1,316 @@
 
 
1
+ #!/usr/bin/env bash
2
+ # =============================================================================
3
+ # monitor_3b.sh — live monitoring, anomaly detection and automatic checkpoint cleanup for the 3B run
4
+ #
5
+ # Usage:
6
+ # bash scripts/monitor_3b.sh # default monitoring
7
+ # bash scripts/monitor_3b.sh --check-once # run the checks once
8
+ # bash scripts/monitor_3b.sh --auto-cleanup # automatically delete old checkpoints
9
+ #
10
+ # 3B-specific behaviour:
11
+ # - checkpoints are 27 GB each → stricter disk monitoring
12
+ # - NCCL hang detection + optional auto-restart
13
+ # - live estimated-completion-time calculation
14
+ # - prevents duplicate process launches
15
+ # =============================================================================
16
+ set -euo pipefail
17
+
18
+ # ---- Configuration ----------------------------------------------------------
19
+ RUN_NAME="${RUN_NAME:-korean_3b_fp8_run1}"
20
+ LOG_FILE="${1:-checkpoints/${RUN_NAME}/train.log}"
21
+ CKPT_DIR="checkpoints/${RUN_NAME}"
22
+ CHECK_INTERVAL=60 # 3B steps are farther apart → 60 s
23
+ ZERO_LOSS_THRESHOLD=3
24
+ GNORM_WARN=10.0
25
+ GNORM_CRITICAL=50.0
26
+ LOSS_SPIKE_FACTOR=3.0
27
+ STALL_TIMEOUT=600 # 10 min (3B steps take longer)
28
+ DISK_WARN_PCT=85
29
+ DISK_CRITICAL_PCT=92
30
+ GPU_UTIL_WARN=50
31
+ MAX_CHECKPOINTS=15 # maximum number of retained checkpoints (15 × 27 GB = 405 GB)
32
+ CHECK_ONCE=false
33
+ AUTO_CLEANUP=false
34
+ AUTO_RESTART=false
35
+
36
+ # Parse args
37
+ for arg in "$@"; do
38
+ case "$arg" in
39
+ --check-once) CHECK_ONCE=true ;;
40
+ --auto-cleanup) AUTO_CLEANUP=true ;;
41
+ --auto-restart) AUTO_RESTART=true ;;
42
+ esac
43
+ done
44
+ # Fix LOG_FILE if first arg was a flag
45
+ if [[ "$LOG_FILE" == --* ]]; then
46
+ LOG_FILE="checkpoints/${RUN_NAME}/train.log"
47
+ fi
48
+
49
+ # ---- Colors -----------------------------------------------------------------
50
+ RED='\033[0;31m'; YELLOW='\033[1;33m'; GREEN='\033[0;32m'
51
+ CYAN='\033[0;36m'; MAGENTA='\033[0;35m'; NC='\033[0m'
52
+
53
+ timestamp() { date '+%Y-%m-%d %H:%M:%S'; }
54
+
55
+ alert() {
56
+ local level="$1" msg="$2"
57
+ case "$level" in
58
+ CRITICAL) echo -e "${RED}🔴 [$(timestamp)] [CRITICAL] ${msg}${NC}" ;;
59
+ WARNING) echo -e "${YELLOW}🟠 [$(timestamp)] [WARNING] ${msg}${NC}" ;;
60
+ INFO) echo -e "${CYAN}🟡 [$(timestamp)] [INFO] ${msg}${NC}" ;;
61
+ OK) echo -e "${GREEN}✅ [$(timestamp)] [OK] ${msg}${NC}" ;;
62
+ esac
63
+ }
64
+
65
+ # ---- Parse metrics ----------------------------------------------------------
66
+ parse_metrics() {
67
+ local n="${1:-20}"
68
+ [[ -f "$LOG_FILE" ]] || return
69
+ tail -n "$n" "$LOG_FILE" | grep "step.*loss.*gnorm" || true
70
+ }
71
+
72
+ extract_field() {
73
+ echo "$1" | grep -oP "${2}\s+\K[0-9]+\.[0-9e+\-]+" | head -1
74
+ }
75
+
76
+ extract_step() {
77
+ echo "$1" | grep -oP "step\s+\K[0-9]+" | head -1
78
+ }
79
+
80
+ # ---- Check: Loss = 0 -------------------------------------------------------
81
+ check_loss_zero() {
82
+ local lines
83
+ lines=$(parse_metrics "$ZERO_LOSS_THRESHOLD")
84
+ [[ -z "$lines" ]] && return 0
85
+ local zero_count=0
86
+ while IFS= read -r line; do
87
+ local loss=$(extract_field "$line" "loss")
88
+ if [[ -n "$loss" ]] && (( $(echo "$loss < 0.001" | bc -l 2>/dev/null || echo 0) )); then
89
+ ((zero_count++))
90
+ fi
91
+ done <<< "$lines"
92
+ if [[ $zero_count -ge $ZERO_LOSS_THRESHOLD ]]; then
93
+ alert CRITICAL "Loss가 ${zero_count}회 연속 ~0! Labels 버그. 즉시 중단!"
94
+ return 1
95
+ fi
96
+ }
97
+
98
+ # ---- Check: Loss spike -----------------------------------------------------
99
+ check_loss_spike() {
100
+ local lines=$(parse_metrics 20)
101
+ [[ -z "$lines" ]] && return 0
102
+ local losses=()
103
+ while IFS= read -r line; do
104
+ local loss=$(extract_field "$line" "loss")
105
+ [[ -n "$loss" ]] && losses+=("$loss")
106
+ done <<< "$lines"
107
+ local count=${#losses[@]}
108
+ [[ $count -lt 5 ]] && return 0
109
+ local last="${losses[$((count-1))]}"
110
+ local sum=0
111
+ for ((i=0; i<count-1; i++)); do
112
+ sum=$(echo "$sum + ${losses[$i]}" | bc -l 2>/dev/null || echo "$sum")
113
+ done
114
+ local avg=$(echo "$sum / ($count - 1)" | bc -l 2>/dev/null || echo "0")
115
+ if [[ "$avg" != "0" ]]; then
116
+ local ratio=$(echo "$last / $avg" | bc -l 2>/dev/null || echo "1")
117
+ if (( $(echo "$ratio > $LOSS_SPIKE_FACTOR" | bc -l 2>/dev/null || echo 0) )); then
118
+ alert WARNING "Loss spike! 현재=${last}, 평균=${avg}, 비율=${ratio}x"
119
+ fi
120
+ fi
121
+ }
122
+
123
+ # ---- Check: Gradient norm ---------------------------------------------------
124
+ check_gnorm() {
125
+ local lines=$(parse_metrics 5)
126
+ [[ -z "$lines" ]] && return 0
127
+ local gnorm=$(extract_field "$(echo "$lines" | tail -1)" "gnorm")
128
+ [[ -z "$gnorm" ]] && return 0
129
+ if (( $(echo "$gnorm > $GNORM_CRITICAL" | bc -l 2>/dev/null || echo 0) )); then
130
+ alert CRITICAL "GNorm=${gnorm} > ${GNORM_CRITICAL}! 발산 직전."
131
+ elif (( $(echo "$gnorm > $GNORM_WARN" | bc -l 2>/dev/null || echo 0) )); then
132
+ alert WARNING "GNorm=${gnorm} 불안정."
133
+ fi
134
+ }
135
+
136
+ # ---- Check: Stall / NCCL hang ----------------------------------------------
137
+ check_stall() {
138
+ [[ ! -f "$LOG_FILE" ]] && return 0
139
+ local last_mod=$(stat -c %Y "$LOG_FILE" 2>/dev/null || echo 0)
140
+ local now=$(date +%s)
141
+ local diff=$((now - last_mod))
142
+ if [[ $diff -gt $STALL_TIMEOUT ]]; then
143
+ alert CRITICAL "로그 ${diff}초 ($(( diff/60 ))분) 멈춤! NCCL hang 가능성."
144
+ # Auto-restart on NCCL hang
145
+ if $AUTO_RESTART; then
146
+ alert WARNING "자동 재시작 시도..."
147
+ local pid=$(pgrep -f "pretrain.py.*korean_3b" | head -1 || true)
148
+ if [[ -n "$pid" ]]; then
149
+ kill -9 "$pid" 2>/dev/null || true
150
+ sleep 5
151
+ alert INFO "이전 프로세스 종료. launch_3b_pretrain.sh 재실행 필요."
152
+ fi
153
+ fi
154
+ fi
155
+ }
156
+
157
+ # ---- Check: Disk (stricter for 3B) --------------------------------------------------
158
+ check_disk() {
159
+ local usage=$(df /PROJECT 2>/dev/null | awk 'NR==2 {print $5}' | tr -d '%')
160
+ if [[ -n "$usage" && "$usage" -gt "$DISK_CRITICAL_PCT" ]]; then
161
+ alert CRITICAL "디스크 ${usage}% > ${DISK_CRITICAL_PCT}%! 즉시 정리 필요!"
162
+ $AUTO_CLEANUP && cleanup_old_checkpoints
163
+ elif [[ -n "$usage" && "$usage" -gt "$DISK_WARN_PCT" ]]; then
164
+ alert WARNING "디스크 ${usage}% > ${DISK_WARN_PCT}%. 체크포인트 정리 권장."
165
+ fi
166
+ }
167
+
168
+ # ---- Check: GPU utilization -------------------------------------------------
169
+ check_gpu() {
170
+ command -v nvidia-smi &>/dev/null || return 0
171
+ local low=0 total=0
172
+ while IFS= read -r util; do
173
+ ((total++))
174
+ [[ "$util" -lt "$GPU_UTIL_WARN" ]] && ((low++))
175
+ done < <(nvidia-smi --query-gpu=utilization.gpu --format=csv,noheader,nounits 2>/dev/null)
176
+ [[ $total -gt 0 && $low -gt 0 ]] && alert INFO "${low}/${total} GPU util < ${GPU_UTIL_WARN}%"
177
+ }
178
+
179
+ # ---- Check: Checkpoint integrity -----------------------------------------------
180
+ check_checkpoint_integrity() {
181
+ local latest=$(ls -d "${CKPT_DIR}"/checkpoint-* 2>/dev/null | sort -V | tail -1 || true)
182
+ [[ -z "$latest" ]] && return 0
183
+ # Make sure the minimum expected files exist
184
+ if [[ ! -f "${latest}/model.pt" ]] && [[ ! -f "${latest}/model.safetensors" ]]; then
185
+ alert WARNING "최근 체크포인트에 모델 파일 없음: ${latest}"
186
+ fi
187
+ # Size check (a 3B model.pt is at least 2 GB)
188
+ local size=$(du -sb "${latest}" 2>/dev/null | awk '{print $1}')
189
+ if [[ -n "$size" && "$size" -lt 2000000000 ]]; then
190
+ alert WARNING "체크포인트 크기 비정상 (${size} bytes < 2GB): ${latest}"
191
+ fi
192
+ }
193
+
194
+ # ---- Cleanup: automatically delete old checkpoints ------------------------------------
195
+ cleanup_old_checkpoints() {
196
+ local ckpts=($(ls -d "${CKPT_DIR}"/checkpoint-* 2>/dev/null | sort -V))
197
+ local count=${#ckpts[@]}
198
+ if [[ $count -le $MAX_CHECKPOINTS ]]; then
199
+ alert OK "체크포인트 ${count}개 ≤ ${MAX_CHECKPOINTS}. 정리 불필요."
200
+ return
201
+ fi
202
+ # Preserve milestone checkpoints (every 10K steps)
203
+ local deletable=()
204
+ local preserved=()
205
+ for ckpt in "${ckpts[@]}"; do
206
+ local step_num=$(basename "$ckpt" | grep -oP '\d+' || echo "0")
207
+ if (( step_num % 10000 == 0 && step_num > 0 )); then
208
+ preserved+=("$ckpt")
209
+ else
210
+ deletable+=("$ckpt")
211
+ fi
212
+ done
213
+ # Always keep the most recent MAX_CHECKPOINTS checkpoints
214
+ local n_deletable=${#deletable[@]}
215
+ local total_keep=$(( ${#preserved[@]} + MAX_CHECKPOINTS ))
216
+ local to_delete=$(( count - total_keep ))
217
+ [[ $to_delete -le 0 ]] && { alert OK "정리 불필요 (이정표 ${#preserved[@]}개 + 최근 ${MAX_CHECKPOINTS}개 보존)."; return; }
218
+ alert INFO "${count}개 체크포인트 → ${to_delete}개 삭제 (이정표 ${#preserved[@]}개 영구 보존)"
219
+ local deleted=0
220
+ for ckpt in "${deletable[@]}"; do
221
+ [[ $deleted -ge $to_delete ]] && break
222
+ local ckpt_size=$(du -sh "$ckpt" 2>/dev/null | awk '{print $1}')
223
+ echo " 삭제: $ckpt (${ckpt_size})"
224
+ rm -rf "$ckpt"
225
+ ((deleted++))
226
+ done
227
+ alert OK "체크포인트 정리 완료. (${deleted}개 삭제)"
228
+ }
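The retention rule implemented by cleanup_old_checkpoints() (milestone checkpoints every 10K steps are kept forever, plus the most recent MAX_CHECKPOINTS) can be summarised as a small selection function; this is a sketch with hypothetical step numbers, not code from the repository:

```python
def checkpoints_to_delete(steps, max_keep=15, milestone=10_000):
    """Oldest non-milestone checkpoint steps to remove so that only the
    milestones plus the max_keep most recent checkpoints remain."""
    steps = sorted(steps)
    milestones = {s for s in steps if s > 0 and s % milestone == 0}
    deletable = [s for s in steps if s not in milestones]
    excess = len(steps) - (len(milestones) + max_keep)
    return deletable[:max(excess, 0)]  # delete oldest first, only as many as needed

# Hypothetical run with a checkpoint every 1000 steps up to 37000.
steps = list(range(1000, 38_000, 1000))
print(checkpoints_to_delete(steps)[:5])  # [1000, 2000, 3000, 4000, 5000]
```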
229
+
230
+ # ---- ETA calculation ---------------------------------------------------------------
231
+ estimate_eta() {
232
+ [[ ! -f "$LOG_FILE" ]] && return
233
+ # Latest step number + timestamp
234
+ local lines=$(parse_metrics 50)
235
+ [[ -z "$lines" ]] && return
236
+ local last_line=$(echo "$lines" | tail -1)
237
+ local first_line=$(echo "$lines" | head -1)
238
+ local cur_step=$(extract_step "$last_line")
239
+ local max_steps=$(grep -oP "max_steps.*?(\d+)" "${CKPT_DIR}/train.log" 2>/dev/null | head -1 | grep -oP '\d+$' || echo "57000")
240
+
241
+ [[ -z "$cur_step" || "$cur_step" == "0" ]] && return
242
+
243
+ # step/sec from log timestamps (approximate)
244
+ local remaining=$((max_steps - cur_step))
245
+ if [[ $remaining -le 0 ]]; then
246
+ echo -e "${MAGENTA}📊 진행: ${cur_step}/${max_steps} (완료!)${NC}"
247
+ return
248
+ fi
249
+
250
+ # Rough ETA based on file timestamps
251
+ local first_time=$(head -20 "$LOG_FILE" 2>/dev/null | grep -oP '^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}' | head -1 || true)
252
+ if [[ -n "$first_time" ]]; then
253
+ local start_epoch=$(date -d "$first_time" +%s 2>/dev/null || echo 0)
254
+ local now=$(date +%s)
255
+ if [[ $start_epoch -gt 0 && $cur_step -gt 0 ]]; then
256
+ local elapsed=$((now - start_epoch))
257
+ local sec_per_step=$(echo "$elapsed / $cur_step" | bc -l 2>/dev/null || echo "0")
258
+ local eta_sec=$(echo "$remaining * $sec_per_step" | bc 2>/dev/null | cut -d. -f1 || echo "0")
259
+ local eta_hours=$(echo "$eta_sec / 3600" | bc 2>/dev/null || echo "?")
260
+ local pct=$(echo "scale=1; $cur_step * 100 / $max_steps" | bc 2>/dev/null || echo "?")
261
+ echo -e "${MAGENTA}📊 진행: ${cur_step}/${max_steps} (${pct}%) | 남은 시간: ~${eta_hours}h | ${sec_per_step}s/step${NC}"
262
+ fi
263
+ else
264
+ echo -e "${MAGENTA}📊 진행: ${cur_step}/${max_steps}${NC}"
265
+ fi
266
+ }
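estimate_eta() is plain linear extrapolation from the first log timestamp; the same arithmetic as a sketch with placeholder numbers:

```python
# Linear ETA extrapolation, as in estimate_eta(): elapsed time divided by steps done so far.
cur_step, max_steps = 21_500, 57_000   # hypothetical progress
elapsed_sec = cur_step * 4.2           # pretend the run has averaged ~4.2 s/step

sec_per_step = elapsed_sec / cur_step
eta_hours = (max_steps - cur_step) * sec_per_step / 3600
pct = 100 * cur_step / max_steps
print(f"{cur_step}/{max_steps} ({pct:.1f}%) | ~{eta_hours:.0f}h left | {sec_per_step:.1f}s/step")
```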
267
+
268
+ # ---- Status summary ---------------------------------------------------------
269
+ print_status() {
270
+ local lines=$(parse_metrics 1)
271
+ [[ -n "$lines" ]] && echo -e "${GREEN}최근:${NC} $lines"
272
+ estimate_eta
273
+ if command -v nvidia-smi &>/dev/null; then
274
+ echo -e "${CYAN}GPU:${NC}"
275
+ nvidia-smi --query-gpu=index,memory.used,memory.total,utilization.gpu,temperature.gpu \
276
+ --format=csv,noheader 2>/dev/null | head -8
277
+ fi
278
+ local ckpt_count=$(ls -d "${CKPT_DIR}"/checkpoint-* 2>/dev/null | wc -l)
279
+ local ckpt_size=$(du -sh "${CKPT_DIR}" 2>/dev/null | awk '{print $1}')
280
+ echo -e "${CYAN}체크포인트:${NC} ${ckpt_count}개 (${ckpt_size})"
281
+ local disk=$(df -h /PROJECT 2>/dev/null | awk 'NR==2 {print $3"/"$2" ("$5")"}')
282
+ echo -e "${CYAN}디스크:${NC} ${disk}"
283
+ }
284
+
285
+ # ---- Main -------------------------------------------------------------------
286
+ echo "=================================================================="
287
+ echo " 3B Training Monitor"
288
+ echo " Run: ${RUN_NAME}"
289
+ echo " Log: ${LOG_FILE}"
290
+ echo " Interval: ${CHECK_INTERVAL}s"
291
+ echo " Auto-cleanup: ${AUTO_CLEANUP} | Auto-restart: ${AUTO_RESTART}"
292
+ echo " Ctrl+C to stop"
293
+ echo "=================================================================="
294
+
295
+ run_all_checks() {
296
+ check_loss_zero || true
297
+ check_loss_spike || true
298
+ check_gnorm || true
299
+ check_stall || true
300
+ check_disk || true
301
+ check_gpu || true
302
+ check_checkpoint_integrity || true
303
+ echo "---"
304
+ print_status
305
+ echo ""
306
+ }
307
+
308
+ if $CHECK_ONCE; then
309
+ run_all_checks
310
+ exit 0
311
+ fi
312
+
313
+ while true; do
314
+ run_all_checks
315
+ sleep "$CHECK_INTERVAL"
316
+ done
source/scripts/monitor_training.sh ADDED
@@ -0,0 +1,244 @@
 
 
1
+ #!/usr/bin/env bash
2
+ # =============================================================================
3
+ # monitor_training.sh — live SFT training monitoring + anomaly detection
4
+ #
5
+ # Usage:
6
+ # bash scripts/monitor_training.sh # default log path
7
+ # bash scripts/monitor_training.sh /path/to/train.log # custom path
8
+ # bash scripts/monitor_training.sh --check-once # run the checks once, then exit
9
+ #
10
+ # Monitored conditions:
11
+ # 🔴 loss = 0.0000 (3 consecutive steps) → labels bug
12
+ # 🔴 gnorm > 50.0 → about to diverge
13
+ # 🔴 log silent for 5+ minutes → hang
14
+ # 🟠 loss spike (3× the moving average) → bad batch / LR
15
+ # 🟠 gnorm > 10.0 → unstable
16
+ # 🟠 disk > 80% → cleanup needed
17
+ # 🟡 GPU util < 50% → bottleneck
18
+ # =============================================================================
19
+ set -euo pipefail
20
+
21
+ # ---- Configuration ----------------------------------------------------------
22
+ LOG_FILE="${1:-checkpoints/korean_1b_sft/train.log}"
23
+ CHECK_INTERVAL=30 # polling interval in seconds
24
+ ZERO_LOSS_THRESHOLD=3 # warn if loss=0 for N consecutive steps
25
+ GNORM_WARN=10.0
26
+ GNORM_CRITICAL=50.0
27
+ LOSS_SPIKE_FACTOR=3.0 # spike if the last loss is ≥ N× the moving average
28
+ STALL_TIMEOUT=300 # seconds (5 min) of log silence before flagging a stall
29
+ DISK_WARN_PCT=80
30
+ GPU_UTIL_WARN=50
31
+ CHECK_ONCE=false
32
+
33
+ if [[ "${1:-}" == "--check-once" ]]; then
34
+ CHECK_ONCE=true
35
+ LOG_FILE="${2:-checkpoints/korean_1b_sft/train.log}"
36
+ fi
37
+
38
+ # ---- Colors -----------------------------------------------------------------
39
+ RED='\033[0;31m'
40
+ YELLOW='\033[1;33m'
41
+ GREEN='\033[0;32m'
42
+ CYAN='\033[0;36m'
43
+ NC='\033[0m'
44
+
45
+ # ---- Helper -----------------------------------------------------------------
46
+ timestamp() { date '+%Y-%m-%d %H:%M:%S'; }
47
+
48
+ alert() {
49
+ local level="$1" msg="$2"
50
+ case "$level" in
51
+ CRITICAL) echo -e "${RED}🔴 [$(timestamp)] [CRITICAL] ${msg}${NC}" ;;
52
+ WARNING) echo -e "${YELLOW}🟠 [$(timestamp)] [WARNING] ${msg}${NC}" ;;
53
+ INFO) echo -e "${CYAN}🟡 [$(timestamp)] [INFO] ${msg}${NC}" ;;
54
+ OK) echo -e "${GREEN}✅ [$(timestamp)] [OK] ${msg}${NC}" ;;
55
+ esac
56
+ }
57
+
58
+ # ---- Parse last N log lines -------------------------------------------------
59
+ parse_metrics() {
60
+ # Log format: [timestamp] [INFO] step XXXX | loss X.XXXX | lr X.XXe-XX | gnorm X.XXX | ...
61
+ local n="${1:-20}"
62
+ if [[ ! -f "$LOG_FILE" ]]; then
63
+ echo ""
64
+ return
65
+ fi
66
+ tail -n "$n" "$LOG_FILE" | grep "step.*loss.*gnorm" || true
67
+ }
68
+
69
+ extract_field() {
70
+ # $1=line, $2=field name (loss, gnorm, lr)
71
+ echo "$1" | grep -oP "${2}\s+\K[0-9]+\.[0-9e+\-]+" | head -1
72
+ }
73
+
74
+ # ---- Check functions --------------------------------------------------------
75
+
76
+ check_loss_zero() {
77
+ local lines
78
+ lines=$(parse_metrics "$ZERO_LOSS_THRESHOLD")
79
+ if [[ -z "$lines" ]]; then return; fi
80
+
81
+ local zero_count=0
82
+ while IFS= read -r line; do
83
+ local loss
84
+ loss=$(extract_field "$line" "loss")
85
+ if [[ -n "$loss" ]]; then
86
+ # loss < 0.001
87
+ if (( $(echo "$loss < 0.001" | bc -l 2>/dev/null || echo 0) )); then
88
+ ((zero_count++))
89
+ fi
90
+ fi
91
+ done <<< "$lines"
92
+
93
+ if [[ $zero_count -ge $ZERO_LOSS_THRESHOLD ]]; then
94
+ alert CRITICAL "Loss가 ${zero_count}회 연속 ~0! Labels 버그 가능성. 즉시 학습 중단!"
95
+ return 1
96
+ fi
97
+ return 0
98
+ }
99
+
100
+ check_loss_spike() {
101
+ local lines
102
+ lines=$(parse_metrics 20)
103
+ if [[ -z "$lines" ]]; then return 0; fi
104
+
105
+ local losses=()
106
+ while IFS= read -r line; do
107
+ local loss
108
+ loss=$(extract_field "$line" "loss")
109
+ [[ -n "$loss" ]] && losses+=("$loss")
110
+ done <<< "$lines"
111
+
112
+ local count=${#losses[@]}
113
+ if [[ $count -lt 5 ]]; then return 0; fi
114
+
115
+ # Compare the latest value against the mean of the preceding ones
116
+ local last_loss="${losses[$((count-1))]}"
117
+ local sum=0
118
+ for ((i=0; i<count-1; i++)); do
119
+ sum=$(echo "$sum + ${losses[$i]}" | bc -l 2>/dev/null || echo "$sum")
120
+ done
121
+ local avg=$(echo "$sum / ($count - 1)" | bc -l 2>/dev/null || echo "0")
122
+
123
+ if [[ "$avg" != "0" ]]; then
124
+ local ratio=$(echo "$last_loss / $avg" | bc -l 2>/dev/null || echo "1")
125
+ if (( $(echo "$ratio > $LOSS_SPIKE_FACTOR" | bc -l 2>/dev/null || echo 0) )); then
126
+ alert WARNING "Loss spike 감지! 현재=${last_loss}, 평균=${avg}, 비율=${ratio}x"
127
+ fi
128
+ fi
129
+ return 0
130
+ }
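The spike rule in check_loss_spike() flags the newest loss when it exceeds LOSS_SPIKE_FACTOR times the mean of the earlier window; a minimal numeric sketch of the same check:

```python
def is_loss_spike(losses, factor=3.0, min_points=5):
    """True if the latest loss exceeds factor times the mean of the earlier values."""
    if len(losses) < min_points:
        return False
    *history, last = losses
    avg = sum(history) / len(history)
    return avg > 0 and last / avg > factor

print(is_loss_spike([2.1, 2.0, 2.05, 1.98, 2.02, 6.5]))   # True: 6.5 is > 3x the ~2.0 average
print(is_loss_spike([2.1, 2.0, 2.05, 1.98, 2.02, 2.10]))  # False
```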
131
+
132
+ check_gnorm() {
133
+ local lines
134
+ lines=$(parse_metrics 5)
135
+ if [[ -z "$lines" ]]; then return 0; fi
136
+
137
+ local last_line
138
+ last_line=$(echo "$lines" | tail -1)
139
+ local gnorm
140
+ gnorm=$(extract_field "$last_line" "gnorm")
141
+
142
+ if [[ -z "$gnorm" ]]; then return 0; fi
143
+
144
+ if (( $(echo "$gnorm > $GNORM_CRITICAL" | bc -l 2>/dev/null || echo 0) )); then
145
+ alert CRITICAL "GNorm=${gnorm} > ${GNORM_CRITICAL}! 발산 직전. 학습 중단 고려."
146
+ elif (( $(echo "$gnorm > $GNORM_WARN" | bc -l 2>/dev/null || echo 0) )); then
147
+ alert WARNING "GNorm=${gnorm} > ${GNORM_WARN}. 불안정 징후."
148
+ fi
149
+ return 0
150
+ }
151
+
152
+ check_stall() {
153
+ if [[ ! -f "$LOG_FILE" ]]; then
154
+ alert INFO "로그 파일 없음: ${LOG_FILE}"
155
+ return 0
156
+ fi
157
+
158
+ local last_modified
159
+ last_modified=$(stat -c %Y "$LOG_FILE" 2>/dev/null || echo 0)
160
+ local now
161
+ now=$(date +%s)
162
+ local diff=$((now - last_modified))
163
+
164
+ if [[ $diff -gt $STALL_TIMEOUT ]]; then
165
+ alert CRITICAL "로그가 ${diff}초 ($(( diff/60 ))분) 동안 업데이트 없음! Hang 가능성."
166
+ fi
167
+ return 0
168
+ }
169
+
170
+ check_disk() {
171
+ local usage
172
+ usage=$(df /PROJECT 2>/dev/null | awk 'NR==2 {print $5}' | tr -d '%')
173
+ if [[ -n "$usage" && "$usage" -gt "$DISK_WARN_PCT" ]]; then
174
+ alert WARNING "디스크 사용률 ${usage}% > ${DISK_WARN_PCT}%. 체크포인트 정리 필요."
175
+ fi
176
+ return 0
177
+ }
178
+
179
+ check_gpu() {
180
+ if ! command -v nvidia-smi &>/dev/null; then return 0; fi
181
+
182
+ local low_util=0
183
+ local total_gpus=0
184
+ while IFS= read -r util; do
185
+ ((total_gpus++))
186
+ if [[ "$util" -lt "$GPU_UTIL_WARN" ]]; then
187
+ ((low_util++))
188
+ fi
189
+ done < <(nvidia-smi --query-gpu=utilization.gpu --format=csv,noheader,nounits 2>/dev/null)
190
+
191
+ if [[ $total_gpus -gt 0 && $low_util -gt 0 ]]; then
192
+ alert INFO "${low_util}/${total_gpus} GPU utilization < ${GPU_UTIL_WARN}%. 데이터 로딩 병목?"
193
+ fi
194
+ return 0
195
+ }
196
+
197
+ # ---- Status summary ---------------------------------------------------------
198
+ print_status() {
199
+ local lines
200
+ lines=$(parse_metrics 1)
201
+ if [[ -n "$lines" ]]; then
202
+ echo -e "${GREEN}최근 로그:${NC} $lines"
203
+ fi
204
+
205
+ if command -v nvidia-smi &>/dev/null; then
206
+ echo -e "${CYAN}GPU 메모리:${NC}"
207
+ nvidia-smi --query-gpu=index,memory.used,memory.total,utilization.gpu \
208
+ --format=csv,noheader 2>/dev/null | head -8
209
+ fi
210
+
211
+ local disk
212
+ disk=$(df -h /PROJECT 2>/dev/null | awk 'NR==2 {print "사용: "$3"/"$2" ("$5")"}')
213
+ echo -e "${CYAN}디스크:${NC} ${disk}"
214
+ }
215
+
216
+ # ---- Main loop --------------------------------------------------------------
217
+ echo "=================================================================="
218
+ echo " SFT Training Monitor"
219
+ echo " Log file: ${LOG_FILE}"
220
+ echo " Check interval: ${CHECK_INTERVAL}s"
221
+ echo " Press Ctrl+C to stop"
222
+ echo "=================================================================="
223
+
224
+ run_all_checks() {
225
+ check_loss_zero || true
226
+ check_loss_spike || true
227
+ check_gnorm || true
228
+ check_stall || true
229
+ check_disk || true
230
+ check_gpu || true
231
+ echo "---"
232
+ print_status
233
+ echo ""
234
+ }
235
+
236
+ if $CHECK_ONCE; then
237
+ run_all_checks
238
+ exit 0
239
+ fi
240
+
241
+ while true; do
242
+ run_all_checks
243
+ sleep "$CHECK_INTERVAL"
244
+ done
source/scripts/openclaw-watchdog.sh ADDED
@@ -0,0 +1,243 @@
 
 
1
+ #!/usr/bin/env bash
2
+ #
3
+ # openclaw-watchdog.sh — OpenClaw Gateway health check + auto-restart
4
+ # Register in crontab to run every minute
5
+ #
6
+ # Usage:
7
+ # */1 * * * * /PROJECT/0325120031_A/ghong/taketimes/llm-bang/scripts/openclaw-watchdog.sh
8
+ #
9
+ # Change log:
10
+ # 2026-03-01 Switched the network check from ICMP to HTTP (handles environments that block ICMP)
11
+ # Added multi-endpoint fallback and a gateway HTTP response check
12
+ # Detached execution via setsid, more detailed logging
13
+
14
+ set -euo pipefail
15
+
16
+ # ── Configuration ──────────────────────────────────────
17
+ RNTIER_HOME="REDACTED_RNTIER_PATH"
18
+ OPENCLAW_BIN="${RNTIER_HOME}/.npm-global/bin/openclaw"
19
+ GATEWAY_PORT=18789
20
+ GATEWAY_HOST="127.0.0.1"
21
+ PID_FILE="/tmp/openclaw-gateway.pid"
22
+ LOG_DIR="/tmp/openclaw"
23
+ LOG_FILE="${LOG_DIR}/watchdog.log"
24
+ GATEWAY_LOG="${LOG_DIR}/gateway.log"
25
+ MAX_LOG_SIZE=$((10 * 1024 * 1024)) # rotate at 10 MB
26
+ RESTART_COOLDOWN=120 # seconds: suppress retries for this long after a restart
27
+ LAST_RESTART_FILE="/tmp/openclaw-last-restart"
28
+ CONSECUTIVE_FAIL_FILE="/tmp/openclaw-consecutive-fails"
29
+
30
+ # Environment variables so that openclaw can locate its config
31
+ export PATH="${RNTIER_HOME}/.npm-global/bin:/usr/bin:/usr/local/bin:/bin:$PATH"
32
+ export HOME="/home/ghong"
33
+ export OPENCLAW_STATE_DIR="${RNTIER_HOME}/.openclaw"
34
+ export OPENCLAW_CONFIG_PATH="${RNTIER_HOME}/.openclaw/openclaw.json"
35
+
36
+ # ── Functions ──────────────────────────────────────────
37
+ mkdir -p "$LOG_DIR"
38
+
39
+ log() {
40
+ echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*" >> "$LOG_FILE"
41
+ }
42
+
43
+ rotate_log() {
44
+ local file="$1"
45
+ if [[ -f "$file" ]] && [[ $(stat -c%s "$file" 2>/dev/null || echo 0) -gt $MAX_LOG_SIZE ]]; then
46
+ mv "$file" "${file}.old"
47
+ log "Log rotated: $file"
48
+ fi
49
+ }
50
+
51
+ # Check the local HTTP response on the gateway's actual endpoint
52
+ check_gateway_http() {
53
+ if command -v curl &>/dev/null; then
54
+ curl -sf --max-time 5 -o /dev/null "http://${GATEWAY_HOST}:${GATEWAY_PORT}/__openclaw__/canvas/" 2>/dev/null
55
+ return $?
56
+ fi
57
+ return 1
58
+ }
59
+
60
+ is_port_open() {
61
+ if command -v ss &>/dev/null; then
62
+ ss -tlnH "sport = :${GATEWAY_PORT}" 2>/dev/null | grep -q "$GATEWAY_PORT"
63
+ else
64
+ (echo > /dev/tcp/"$GATEWAY_HOST"/"$GATEWAY_PORT") 2>/dev/null
65
+ fi
66
+ }
67
+
68
+ is_process_alive() {
69
+ if [[ -f "$PID_FILE" ]]; then
70
+ local pid
71
+ pid=$(cat "$PID_FILE" 2>/dev/null)
72
+ if [[ -n "$pid" ]] && kill -0 "$pid" 2>/dev/null; then
73
+ return 0
74
+ fi
75
+ fi
76
+ pgrep -f "openclaw.*gateway" >/dev/null 2>&1
77
+ }
78
+
79
+ # Network check based on DNS resolution
80
+ # On this server both ICMP (ping) and outbound HTTPS (curl) are blocked.
81
+ # DNS resolution still works, though, and the gateway (Node.js) can communicate via long-polling.
82
+ # So DNS resolution success is used to decide whether the network itself is alive.
83
+ check_network() {
84
+ # Method 1: getent (fastest and lightest)
85
+ if command -v getent &>/dev/null; then
86
+ getent hosts api.telegram.org >/dev/null 2>&1 && return 0
87
+ getent hosts api.anthropic.com >/dev/null 2>&1 && return 0
88
+ fi
89
+ # Method 2: nslookup
90
+ if command -v nslookup &>/dev/null; then
91
+ nslookup -timeout=5 api.telegram.org >/dev/null 2>&1 && return 0
92
+ fi
93
+ # Method 3: check the DNS server (168.126.63.1) on port 53 via /dev/tcp
94
+ (echo > /dev/tcp/168.126.63.1/53) 2>/dev/null && return 0
95
+ return 1
96
+ }
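For comparison, the same DNS-reachability idea takes only a few lines of Python (a sketch; the hostnames mirror the ones the shell function probes):

```python
import socket

def network_alive(hosts=("api.telegram.org", "api.anthropic.com")):
    """True if any hostname resolves; DNS still works even where ICMP and outbound HTTPS are blocked."""
    for host in hosts:
        try:
            socket.getaddrinfo(host, 443)
            return True
        except socket.gaierror:
            continue
    return False

print(network_alive())
```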
97
+
98
+ cooldown_active() {
99
+ if [[ -f "$LAST_RESTART_FILE" ]]; then
100
+ local last_restart now diff
101
+ last_restart=$(cat "$LAST_RESTART_FILE" 2>/dev/null)
102
+ now=$(date +%s)
103
+ diff=$(( now - last_restart ))
104
+ if [[ $diff -lt $RESTART_COOLDOWN ]]; then
105
+ return 0 # still in cooldown
106
+ fi
107
+ fi
108
+ return 1 # not in cooldown
109
+ }
110
+
111
+ get_consecutive_fails() {
112
+ if [[ -f "$CONSECUTIVE_FAIL_FILE" ]]; then
113
+ cat "$CONSECUTIVE_FAIL_FILE" 2>/dev/null || echo 0
114
+ else
115
+ echo 0
116
+ fi
117
+ }
118
+
119
+ set_consecutive_fails() {
120
+ echo "$1" > "$CONSECUTIVE_FAIL_FILE"
121
+ }
122
+
123
+ start_gateway() {
124
+ log "ACTION: Starting OpenClaw gateway on port $GATEWAY_PORT..."
125
+
126
+ # Clean up any stale gateway processes
127
+ local old_pids
128
+ old_pids=$(pgrep -f "openclaw.*gateway" 2>/dev/null || true)
129
+ if [[ -n "$old_pids" ]]; then
130
+ log "ACTION: Killing stale gateway processes: $old_pids"
131
+ echo "$old_pids" | xargs kill -9 2>/dev/null || true
132
+ sleep 2
133
+ fi
134
+
135
+ # Start the gateway fully detached with setsid (prevents signal propagation from the parent process)
136
+ setsid nohup "$OPENCLAW_BIN" gateway run \
137
+ --port "$GATEWAY_PORT" \
138
+ --bind loopback \
139
+ >> "$GATEWAY_LOG" 2>&1 < /dev/null &
140
+
141
+ local new_pid=$!
142
+ echo "$new_pid" > "$PID_FILE"
143
+ date +%s > "$LAST_RESTART_FILE"
144
+
145
+ log "ACTION: Gateway launched with PID $new_pid (setsid)"
146
+
147
+ # Wait 8 seconds, then verify (the Telegram provider needs time to initialise)
148
+ sleep 8
149
+ if kill -0 "$new_pid" 2>/dev/null; then
150
+ log "OK: Gateway PID $new_pid is alive after startup"
151
+ if is_port_open; then
152
+ log "OK: Port $GATEWAY_PORT is listening"
153
+ else
154
+ log "WARN: Gateway alive but port $GATEWAY_PORT not yet listening (may need more time)"
155
+ fi
156
+ return 0
157
+ else
158
+ log "ERROR: Gateway PID $new_pid died immediately after start"
159
+ log "ERROR: Last 10 lines of gateway.log:"
160
+ tail -10 "$GATEWAY_LOG" 2>/dev/null | while read -r line; do
161
+ log " | $line"
162
+ done
163
+ return 1
164
+ fi
165
+ }
166
+
167
+ # ── Main logic ─────────────────────────────────────────
168
+ rotate_log "$LOG_FILE"
169
+ rotate_log "$GATEWAY_LOG"
170
+
171
+ # Remove old openclaw log files (older than 7 days)
172
+ find "$LOG_DIR" -name "openclaw-*.log" -mtime +7 -delete 2>/dev/null || true
173
+
174
+ # 1) Check process + port first (no network check needed if the gateway is alive)
175
+ process_ok=false
176
+ port_ok=false
177
+ http_ok=false
178
+
179
+ if is_process_alive; then
180
+ process_ok=true
181
+ fi
182
+
183
+ if is_port_open; then
184
+ port_ok=true
185
+ fi
186
+
187
+ if $port_ok && check_gateway_http; then
188
+ http_ok=true
189
+ fi
190
+
191
+ # 2) If the gateway is healthy, exit immediately
192
+ if $process_ok && $port_ok; then
193
+ if $http_ok; then
194
+ # fully healthy
195
+ set_consecutive_fails 0
196
+ exit 0
197
+ fi
198
+ # Process + port OK but no HTTP response → possibly hung
199
+ fails=$(get_consecutive_fails)
200
+ fails=$((fails + 1))
201
+ set_consecutive_fails "$fails"
202
+ log "WARN: Process alive, port open, but HTTP not responding (consecutive: $fails)"
203
+ if [[ $fails -lt 3 ]]; then
204
+ log "INFO: Waiting more cycles before restart (transient check, $fails/3)"
205
+ exit 0
206
+ fi
207
+ log "WARN: HTTP unresponsive for $fails consecutive checks — proceeding to restart"
208
+ fi
209
+
210
+ # 3) Gateway is unhealthy: check the network, then decide whether to restart
211
+ if $process_ok && ! $port_ok; then
212
+ log "WARN: Process alive but port $GATEWAY_PORT not listening. Possible hung state."
213
+ fi
214
+
215
+ if ! $process_ok && ! $port_ok; then
216
+ log "WARN: Gateway is completely down (no process, no port)."
217
+ fi
218
+
219
+ if ! $process_ok && $port_ok; then
220
+ log "WARN: No known gateway process but port $GATEWAY_PORT is in use. Stale process?"
221
+ fi
222
+
223
+ # 4) Network check, DNS based (runs only when the gateway is down)
224
+ if ! check_network; then
225
+ log "WARN: Network unreachable (DNS resolution failed). Skipping gateway restart."
226
+ exit 0
227
+ fi
228
+
229
+ # 5) Cooldown check
230
+ if cooldown_active; then
231
+ log "INFO: Cooldown active (last restart < ${RESTART_COOLDOWN}s ago). Skipping."
232
+ exit 0
233
+ fi
234
+
235
+ # 6) Restart
236
+ log "ACTION: Attempting gateway restart..."
237
+ if start_gateway; then
238
+ log "OK: Gateway restart SUCCESS"
239
+ set_consecutive_fails 0
240
+ else
241
+ log "ERROR: Gateway restart FAILED"
242
+ exit 1
243
+ fi
source/scripts/orpo_eval_watchdog.sh ADDED
@@ -0,0 +1,127 @@
 
 
1
+ #!/bin/bash
2
+ # =============================================================================
3
+ # ORPO Training Completion Watchdog
4
+ # =============================================================================
5
+ # Monitors the ORPO training process. When it finishes, automatically launches
6
+ # the comprehensive evaluation pipeline.
7
+ #
8
+ # Usage:
9
+ # nohup bash scripts/orpo_eval_watchdog.sh > checkpoints/korean_3b_orpo_v1/watchdog.log 2>&1 &
10
+ # =============================================================================
11
+
12
+ set -euo pipefail
13
+
14
+ PROJECT_ROOT="/PROJECT/0325120031_A/ghong/taketimes/llm-bang"
15
+ TRAIN_LOG="${PROJECT_ROOT}/checkpoints/korean_3b_orpo_v1/train.log"
16
+ TRAIN_PID=$(pgrep -f "train/orpo.py.*korean_3b_orpo_v1" | head -1 || true)  # || true: pgrep returning 1 would otherwise trip set -e/pipefail
17
+
18
+ echo "=============================================="
19
+ echo " ORPO Eval Watchdog Started"
20
+ echo "=============================================="
21
+ echo " Time : $(date '+%Y-%m-%d %H:%M:%S')"
22
+ echo " Train PID : ${TRAIN_PID:-NOT FOUND}"
23
+ echo " Train Log : ${TRAIN_LOG}"
24
+ echo "=============================================="
25
+
26
+ if [ -z "${TRAIN_PID}" ]; then
27
+ echo "[WARN] Training process not found. Checking if already completed..."
28
+ # Check if training already finished by looking for final output
29
+ if grep -q "Training completed" "${TRAIN_LOG}" 2>/dev/null || \
30
+ grep -q "Saving model checkpoint" "${TRAIN_LOG}" 2>/dev/null; then
31
+ echo "[INFO] Training appears to have already completed."
32
+ else
33
+ echo "[ERROR] No training process and no completion marker found. Exiting."
34
+ exit 1
35
+ fi
36
+ else
37
+ echo "[INFO] Watching training PID ${TRAIN_PID}..."
38
+ echo ""
39
+
40
+ # Poll every 60 seconds
41
+ while kill -0 "${TRAIN_PID}" 2>/dev/null; do
42
+ # Get current step
43
+ CURRENT_STEP=$(grep -oP '\d+/9840' "${TRAIN_LOG}" 2>/dev/null | tail -1 || echo "?/?")
44
+ LATEST_LOSS=$(grep "'loss':" "${TRAIN_LOG}" 2>/dev/null | tail -1 | grep -oP "'loss': '([^']+)'" | sed "s/'loss': '//;s/'//" || echo "?")
45
+ echo "[$(date '+%H:%M:%S')] Step ${CURRENT_STEP} | Loss: ${LATEST_LOSS} | PID ${TRAIN_PID} running"
46
+ sleep 60
47
+ done
48
+
49
+ echo ""
50
+ echo "=============================================="
51
+ echo "[INFO] Training process ${TRAIN_PID} has ended."
52
+ echo "[INFO] Time: $(date '+%Y-%m-%d %H:%M:%S')"
53
+ echo "=============================================="
54
+ fi
55
+
56
+ # Wait a moment for any final I/O
57
+ sleep 10
58
+
59
+ # Get final training stats
60
+ echo ""
61
+ echo "[INFO] Final training stats:"
62
+ grep "eval_loss" "${TRAIN_LOG}" | tail -1 | tr ',' '\n' | head -10
63
+ echo ""
64
+
65
+ # Detect the latest checkpoint
66
+ LATEST_CKPT=$(ls -d ${PROJECT_ROOT}/checkpoints/korean_3b_orpo_v1/checkpoint-* 2>/dev/null | sort -t- -k2 -n | tail -1 || true)
67
+ echo "[INFO] Latest checkpoint: ${LATEST_CKPT}"
68
+
69
+ if [ -z "${LATEST_CKPT}" ]; then
70
+ echo "[ERROR] No checkpoint found. Cannot proceed with evaluation."
71
+ exit 1
72
+ fi
73
+
74
+ # Send telegram notification (if available)
75
+ python3 -c "
76
+ import os, urllib.request, urllib.parse, json
77
+ token = os.environ.get('TELEGRAM_BOT_TOKEN', '')
78
+ chat_id = os.environ.get('TELEGRAM_CHAT_ID', '')
79
+ if token and chat_id:
80
+ msg = '🏁 ORPO 학습 완료! 자동 평가 시작합니다.\nCheckpoint: ${LATEST_CKPT##*/}'
81
+ url = f'https://api.telegram.org/bot{token}/sendMessage'
82
+ data = urllib.parse.urlencode({'chat_id': chat_id, 'text': msg}).encode()
83
+ urllib.request.urlopen(url, data, timeout=10)
84
+ print('[INFO] Telegram notification sent.')
85
+ else:
86
+ print('[INFO] Telegram not configured, skipping notification.')
87
+ " 2>/dev/null || true
88
+
89
+ # ============================================================================
90
+ # Launch evaluation pipeline
91
+ # ============================================================================
92
+ echo ""
93
+ echo "=============================================="
94
+ echo " Starting ORPO Evaluation Pipeline"
95
+ echo " Time: $(date '+%Y-%m-%d %H:%M:%S')"
96
+ echo "=============================================="
97
+
98
+ cd "${PROJECT_ROOT}"
99
+
100
+ python3 eval/orpo_eval_pipeline.py \
101
+ --checkpoint "${LATEST_CKPT}" \
102
+ 2>&1 | tee -a checkpoints/korean_3b_orpo_v1/eval.log || EVAL_EXIT=$?
103
+
104
+ EVAL_EXIT=${EVAL_EXIT:-0} # capture the eval pipeline status without letting set -e abort before the notification
105
+
106
+ echo ""
107
+ echo "=============================================="
108
+ echo " Evaluation Complete"
109
+ echo " Exit code: ${EVAL_EXIT}"
110
+ echo " Time: $(date '+%Y-%m-%d %H:%M:%S')"
111
+ echo "=============================================="
112
+
113
+ # Send completion notification
114
+ python3 -c "
115
+ import os, urllib.request, urllib.parse
116
+ token = os.environ.get('TELEGRAM_BOT_TOKEN', '')
117
+ chat_id = os.environ.get('TELEGRAM_CHAT_ID', '')
118
+ if token and chat_id:
119
+ exit_code = ${EVAL_EXIT}
120
+ status = '✅ 성공' if exit_code == 0 else '❌ 실패'
121
+ msg = f'ORPO 평가 완료: {status}\nExit code: {exit_code}\n보고서: reports/ 확인'
122
+ url = f'https://api.telegram.org/bot{token}/sendMessage'
123
+ data = urllib.parse.urlencode({'chat_id': chat_id, 'text': msg}).encode()
124
+ urllib.request.urlopen(url, data, timeout=10)
125
+ " 2>/dev/null || true
126
+
127
+ exit ${EVAL_EXIT}
source/scripts/orpo_hp_sweep.sh ADDED
@@ -0,0 +1,166 @@
1
+ #!/usr/bin/env bash
2
+ # =============================================================================
3
+ # orpo_hp_sweep.sh — ORPO Hyperparameter Sweep (200 steps each)
4
+ #
5
+ # 각 설정을 200 steps씩 돌려서 최적 조합을 찾는 스크립트.
6
+ # 결과는 sweep_results/ 디렉토리에 저장됨.
7
+ #
8
+ # Usage:
9
+ # bash scripts/orpo_hp_sweep.sh # 전체 sweep (6 runs)
10
+ # bash scripts/orpo_hp_sweep.sh --dry-run # 설정만 출력
11
+ # =============================================================================
12
+ set -uo pipefail
13
+ # NOTE: errexit (-e) is intentionally not set; individual runs may fail and the sweep logs them and continues
14
+
15
+ cd "$(dirname "$0")/.."
16
+
17
+ SWEEP_STEPS=200
18
+ SWEEP_DIR="checkpoints/orpo_sweep"
19
+ RESULTS_FILE="${SWEEP_DIR}/sweep_results.jsonl"
20
+ BASE_MODEL="eval/outputs/hf_3b_sft_best"
21
+ DATA_PATH="data/preference/combined_preference.jsonl"
22
+ NPROC=8
23
+ MASTER_PORT_BASE=29510
24
+
25
+ # B200 NCCL tuning (NVSwitch mesh — let NCCL auto-detect proto/channels/algo)
26
+ export NCCL_IB_DISABLE=1
27
+ export NCCL_BUFFSIZE=134217728
28
+ export OMP_NUM_THREADS=9
29
+ export MKL_NUM_THREADS=9
30
+ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
31
+ export NCCL_P2P_LEVEL=NVL
32
+ export PYTHONWARNINGS="ignore::UserWarning:torch.library"
33
+
34
+ mkdir -p "${SWEEP_DIR}"
35
+ declare -a FAILED_RUNS=()
36
+
37
+ # ---------------------------------------------------------------------------
38
+ # Sweep configurations: (name, beta, lr, max_length, batch_size, grad_accum)
39
+ # ---------------------------------------------------------------------------
40
+ # 핵심 탐색 축:
41
+ # 1. beta: 반복 억제 강도 (0.15 vs 0.25 vs 0.35)
42
+ # 2. lr: 수렴 속도 (5e-6 vs 8e-6 vs 1.2e-5)
43
+ # 3. max_length: VRAM vs 커버리지 (1024 vs 1536)
44
+
45
+ declare -a CONFIGS=(
46
+ # name beta lr max_len bs accum
47
+ "baseline_b015_lr8e6 0.15 8e-6 1536 4 4"
48
+ "baseline_b025_lr8e6 0.25 8e-6 1536 4 4"
49
+ "strong_b035_lr8e6 0.35 8e-6 1536 4 4"
50
+ "fast_b025_lr12e6 0.25 1.2e-5 1536 4 4"
51
+ "conserv_b025_lr5e6 0.25 5e-6 1536 4 4"
52
+ "short_b025_lr8e6 0.25 8e-6 1024 4 4"
53
+ )
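+ # Each row is word-split into NAME BETA LR MAX_LEN BS ACCUM by the read -r in the loop below.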
54
+
55
+ DRY_RUN=false
56
+ if [[ "${1:-}" == "--dry-run" ]]; then
57
+ DRY_RUN=true
58
+ fi
59
+
60
+ echo "=================================================================="
61
+ echo " ORPO Hyperparameter Sweep"
62
+ echo " Configs: ${#CONFIGS[@]}"
63
+ echo " Steps each: ${SWEEP_STEPS}"
64
+ echo " Results: ${RESULTS_FILE}"
65
+ echo "=================================================================="
66
+
67
+ for i in "${!CONFIGS[@]}"; do
68
+ read -r NAME BETA LR MAX_LEN BS ACCUM <<< "${CONFIGS[$i]}"
69
+ PORT=$((MASTER_PORT_BASE + i))
70
+ OUTPUT="${SWEEP_DIR}/${NAME}"
71
+
72
+ echo ""
73
+ echo "--- Run $((i+1))/${#CONFIGS[@]}: ${NAME} ---"
74
+ echo " beta=${BETA} lr=${LR} max_length=${MAX_LEN} bs=${BS} accum=${ACCUM}"
75
+
76
+ if [[ "${DRY_RUN}" == "true" ]]; then
77
+ echo " [DRY RUN] skipping"
78
+ continue
79
+ fi
80
+
81
+ mkdir -p "${OUTPUT}"
82
+ START_TIME=$(date +%s)
83
+
84
+ torchrun \
85
+ --nproc_per_node=${NPROC} \
86
+ --master_port=${PORT} \
87
+ train/orpo.py \
88
+ --model_path "${BASE_MODEL}" \
89
+ --custom_data_path "${DATA_PATH}" \
90
+ --output_dir "${OUTPUT}" \
91
+ --max_steps ${SWEEP_STEPS} \
92
+ --lr ${LR} \
93
+ --beta ${BETA} \
94
+ --batch_size ${BS} \
95
+ --gradient_accumulation_steps ${ACCUM} \
96
+ --max_length ${MAX_LEN} \
97
98
+ --weight_decay 0.01 \
99
+ --warmup_ratio 0.05 \
100
+ --eval_split_ratio 0.05 \
101
+ --eval_steps 100 \
102
+ --early_stopping_patience 100 \
103
+ --save_steps 200 \
104
+ --save_total_limit 1 \
105
+ --logging_steps 10 \
106
+ --report_to none \
107
+ --dataset_num_proc 64 \
108
+ --dataloader_num_workers 4 \
109
+ --no_load_best \
110
+ 2>&1 | tee "${OUTPUT}/train.log"
111
+ RUN_EXIT=$?
112
+
113
+ END_TIME=$(date +%s)
114
+ ELAPSED=$((END_TIME - START_TIME))
115
+
116
+ if [[ ${RUN_EXIT} -ne 0 ]]; then
117
+ echo " [ERROR] Run ${NAME} failed with exit code ${RUN_EXIT} after ${ELAPSED}s"
118
+ echo "{\"name\":\"${NAME}\",\"beta\":${BETA},\"lr\":\"${LR}\",\"max_length\":${MAX_LEN},\"status\":\"FAILED\",\"exit_code\":${RUN_EXIT},\"elapsed_s\":${ELAPSED}}" >> "${RESULTS_FILE}"
119
+ FAILED_RUNS+=("${NAME}")
120
+ continue
121
+ fi
122
+
123
+ # Extract final metrics from log
124
+ FINAL_LOSS=$(grep -oP "'loss': '[\d.]+'" "${OUTPUT}/train.log" | tail -1 | grep -oP "[\d.]+" || echo "N/A")
125
+ EVAL_LOSS=$(grep -oP "'eval_loss': '[\d.]+'" "${OUTPUT}/train.log" | tail -1 | grep -oP "[\d.]+" || echo "N/A")
126
+ MARGIN=$(grep -oP "'rewards/margins': '[-\d.]+'" "${OUTPUT}/train.log" | tail -1 | grep -oP "[-\d.]+" || echo "N/A")
127
+
128
+ # Save result
129
+ echo "{\"name\":\"${NAME}\",\"beta\":${BETA},\"lr\":\"${LR}\",\"max_length\":${MAX_LEN},\"status\":\"OK\",\"loss\":\"${FINAL_LOSS}\",\"eval_loss\":\"${EVAL_LOSS}\",\"margin\":\"${MARGIN}\",\"elapsed_s\":${ELAPSED}}" >> "${RESULTS_FILE}"
130
+
131
+ echo " -> loss=${FINAL_LOSS} eval_loss=${EVAL_LOSS} margin=${MARGIN} time=${ELAPSED}s"
132
+
133
+ # Cleanup weights to save disk (keep logs)
134
+ rm -rf "${OUTPUT}/checkpoint-"* "${OUTPUT}/emergency_checkpoint" 2>/dev/null || true
135
+ done
136
+
137
+ echo ""
138
+ echo "=================================================================="
139
+ echo " Sweep Complete!"
140
+ echo " Results: ${RESULTS_FILE}"
141
+ if [[ -f "${RESULTS_FILE}" ]]; then
142
+ echo ""
143
+ echo " Summary:"
144
+ cat "${RESULTS_FILE}" | python3 -c "
145
+ import sys, json
146
+ results = [json.loads(l) for l in sys.stdin]
147
+ results.sort(key=lambda r: float(r['eval_loss']) if str(r.get('eval_loss', 'N/A')) not in ('N/A', 'None', '') else float('inf'))  # failed / N/A runs sort last
148
+ print(f' {\"Name\":<25} {\"Beta\":>6} {\"LR\":>10} {\"Loss\":>8} {\"EvalLoss\":>10} {\"Margin\":>8} {\"Time\":>6}')
149
+ print(f' {\"-\"*25} {\"-\"*6} {\"-\"*10} {\"-\"*8} {\"-\"*10} {\"-\"*8} {\"-\"*6}')
150
+ for r in results:
151
+ print(f' {r[\"name\"]:<25} {r[\"beta\"]:>6} {r[\"lr\"]:>10} {r.get(\"loss\", \"-\"):>8} {r.get(\"eval_loss\", \"-\"):>10} {r.get(\"margin\", \"-\"):>8} {r[\"elapsed_s\"]:>5}s')
+ print()
+ best = results[0]
+ print(f' BEST: {best[\"name\"]} (eval_loss={best.get(\"eval_loss\", \"N/A\")})')
155
+ " 2>/dev/null || cat "${RESULTS_FILE}"
156
+ fi
157
+
158
+ # Report failed runs
159
+ if [[ ${#FAILED_RUNS[@]} -gt 0 ]]; then
160
+ echo ""
161
+ echo " FAILED RUNS (${#FAILED_RUNS[@]}):"
162
+ for fname in "${FAILED_RUNS[@]}"; do
163
+ echo " - ${fname}"
164
+ done
165
+ fi
166
+ echo "=================================================================="
source/scripts/prepare_3b_data.sh ADDED
@@ -0,0 +1,414 @@
1
+ #!/usr/bin/env bash
2
+ # =============================================================================
3
+ # prepare_3b_data.sh — 3B 모델 학습 데이터 전체 파이프라인
4
+ #
5
+ # 사용법:
6
+ # bash scripts/prepare_3b_data.sh [--step N] [--jobs 72]
7
+ #
8
+ # 스텝:
9
+ # 1 = CulturaX 토큰화
10
+ # 2 = cc100 해제 + 토큰화
11
+ # 3 = OSCAR 토큰화
12
+ # 4 = korean_webtext 토큰화
13
+ # 5 = HPLT 한국어 추출 + 토큰화
14
+ # 6 = textbooks + finepdfs + kovast 토큰화
15
+ # 7 = 전체 병합
16
+ # 8 = train/val split 검증
17
+ # =============================================================================
18
+ set -euo pipefail
19
+
20
+ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
21
+ PROJECT_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
22
+ cd "${PROJECT_ROOT}"
23
+
24
+ # ─── 설정 ────────────────────────────────────────────────────────────────
25
+ DATA_DIR="data"
26
+ EXTRA_DIR="data/korean_extra"
27
+ TOKENIZER="tokenizer/tokenizer.json"
28
+ VAL_SPLIT=0.002
29
+ SEED=42
30
+ JOBS=72
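+ # NOTE: --jobs/JOBS is parsed below but not yet used; the tokenization steps currently run single-process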
31
+ FROM_STEP=0
32
+ LOG_FILE="data/prepare_3b.log"
33
+
34
+ while [[ $# -gt 0 ]]; do
35
+ case $1 in
36
+ --step) FROM_STEP="$2"; shift 2 ;;
37
+ --jobs) JOBS="$2"; shift 2 ;;
38
+ *) echo "Unknown arg: $1"; exit 1 ;;
39
+ esac
40
+ done
41
+
42
+ mkdir -p "$(dirname "$LOG_FILE")"
43
+ exec > >(tee -a "$LOG_FILE") 2>&1
44
+
45
+ log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*"; }
46
+
47
+ # ─── 토큰화 헬퍼 (parquet → bin) ─────────────────────────────────────────
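+ # Usage: tokenize_parquet <name> <parquet_glob> <text_column>
+ #   e.g. tokenize_parquet "culturax" "${EXTRA_DIR}/culturax_ko/ko/*.parquet" "text"  (as in Step 1 below)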
48
+ tokenize_parquet() {
49
+ local name="$1"
50
+ local input_pattern="$2"
51
+ local text_col="$3"
52
+ local output="${DATA_DIR}/${name}_train.bin"
53
+
54
+ if [[ -f "$output" && $FROM_STEP -le 0 ]]; then
55
+ log "[SKIP] $output already exists ($(du -h "$output" | cut -f1))"
56
+ return
57
+ fi
58
+
59
+ log "[START] Tokenizing $name from parquet..."
60
+ python3 - <<PYEOF
61
+ import glob, os, sys
62
+ import numpy as np
63
+ from tokenizers import Tokenizer
64
+ import pyarrow.parquet as pq
65
+ from tqdm import tqdm
66
+ from concurrent.futures import ProcessPoolExecutor
67
+ import multiprocessing as mp
68
+
69
+ tokenizer_path = "${TOKENIZER}"
70
+ input_pattern = "${input_pattern}"
71
+ text_col = "${text_col}"
72
+ output_train = "${output}"
73
+ output_val = output_train.replace("_train.bin", "_val.bin")
74
+ val_split = ${VAL_SPLIT}
75
+ seed = ${SEED}
76
+
77
+ files = sorted(glob.glob(input_pattern))
78
+ print(f"Found {len(files)} parquet files")
79
+
80
+ tokenizer = Tokenizer.from_file(tokenizer_path)
81
+
82
+ all_tokens = []
83
+ total_docs = 0
84
+
85
+ for f in tqdm(files, desc="${name}"):
86
+ try:
87
+ table = pq.read_table(f, columns=[text_col])
88
+ for text in table.column(text_col):
89
+ t = text.as_py()
90
+ if t and len(t) > 50:
91
+ ids = tokenizer.encode(t).ids
92
+ all_tokens.extend(ids)
93
+ total_docs += 1
94
+ except Exception as e:
95
+ print(f"Error processing {f}: {e}", file=sys.stderr)
96
+ continue
97
+
98
+ print(f"Total: {total_docs:,} docs, {len(all_tokens):,} tokens")
99
+
100
+ # Split
101
+ # NOTE: no token-level shuffle here: shuffling individual token IDs would destroy document order
+ # and make the resulting .bin useless for LM training/eval. Split sequentially instead,
+ # consistent with the other steps in this script.
104
+ n_val = int(len(all_tokens) * val_split)
105
+ val_tokens = all_tokens[:n_val]
106
+ train_tokens = all_tokens[n_val:]
107
+
108
+ np.array(train_tokens, dtype=np.uint16).tofile(output_train)
109
+ np.array(val_tokens, dtype=np.uint16).tofile(output_val)
110
+ print(f"Saved: {output_train} ({len(train_tokens):,} tokens)")
111
+ print(f"Saved: {output_val} ({len(val_tokens):,} tokens)")
112
+ PYEOF
113
+ log "[DONE] $name → $output"
114
+ }
115
+
116
+ # ─── Step 1: CulturaX ────────────────────────────────────────────────────
117
+ if [[ $FROM_STEP -le 1 ]]; then
118
+ log "=== Step 1: CulturaX 토큰화 ==="
119
+ tokenize_parquet "culturax" \
120
+ "${EXTRA_DIR}/culturax_ko/ko/*.parquet" \
121
+ "text"
122
+ fi
123
+
124
+ # ─── Step 2: cc100 해제 + 토큰화 ─────────────────────────────────────────
125
+ if [[ $FROM_STEP -le 2 ]]; then
126
+ log "=== Step 2: cc100 해제 + 토큰화 ==="
127
+ CC100_XZ="${EXTRA_DIR}/cc100_ko/ko.txt.xz"
128
+ CC100_TXT="${EXTRA_DIR}/cc100_ko/ko.txt"
129
+ CC100_OUT="${DATA_DIR}/cc100_train.bin"
130
+
131
+ if [[ -f "$CC100_OUT" && $FROM_STEP -le 0 ]]; then
132
+ log "[SKIP] cc100 already tokenized"
133
+ else
134
+ # 해제
135
+ if [[ ! -f "$CC100_TXT" ]]; then
136
+ log "Decompressing cc100 xz (14GB → 54GB)..."
137
+ xz -dk "$CC100_XZ"
138
+ log "Decompression done"
139
+ fi
140
+
141
+ # 토큰화 (대용량 → 스트리밍)
142
+ log "Tokenizing cc100 (54GB text)..."
143
+ python3 - <<'PYEOF'
144
+ import numpy as np
145
+ from tokenizers import Tokenizer
146
+ from tqdm import tqdm
147
+ import random
148
+
149
+ tokenizer = Tokenizer.from_file("tokenizer/tokenizer.json")
150
+ input_file = "data/korean_extra/cc100_ko/ko.txt"
151
+ output_train = "data/cc100_train.bin"
152
+ output_val = "data/cc100_val.bin"
153
+
154
+ # Stream tokenize in chunks
155
+ all_tokens = []
156
+ doc_buffer = []
157
+ doc_count = 0
158
+
159
+ with open(input_file, 'r', encoding='utf-8', errors='replace') as f:
160
+ for line in tqdm(f, desc="cc100", unit=" lines"):
161
+ line = line.strip()
162
+ if not line:
163
+ # Document boundary
164
+ if doc_buffer:
165
+ text = '\n'.join(doc_buffer)
166
+ if len(text) > 50:
167
+ ids = tokenizer.encode(text).ids
168
+ all_tokens.extend(ids)
169
+ doc_count += 1
170
+ doc_buffer = []
171
+ else:
172
+ doc_buffer.append(line)
173
+
174
+ # Last doc
175
+ if doc_buffer:
176
+ text = '\n'.join(doc_buffer)
177
+ if len(text) > 50:
178
+ all_tokens.extend(tokenizer.encode(text).ids)
179
+ doc_count += 1
180
+
181
+ print(f"Total: {doc_count:,} docs, {len(all_tokens):,} tokens")
182
+
183
+ # Split
184
+ n_val = int(len(all_tokens) * 0.002)
185
+ np.array(all_tokens[n_val:], dtype=np.uint16).tofile(output_train)
186
+ np.array(all_tokens[:n_val], dtype=np.uint16).tofile(output_val)
187
+ print(f"Saved train: {len(all_tokens)-n_val:,} tokens")
188
+ print(f"Saved val: {n_val:,} tokens")
189
+ PYEOF
190
+ log "[DONE] cc100"
191
+ fi
192
+ fi
193
+
194
+ # ─── Step 3: OSCAR ───────────────────────────────────────────────────────
195
+ if [[ $FROM_STEP -le 3 ]]; then
196
+ log "=== Step 3: OSCAR 토큰화 ==="
197
+ OSCAR_OUT="${DATA_DIR}/oscar_train.bin"
198
+
199
+ if [[ -f "$OSCAR_OUT" && $FROM_STEP -le 0 ]]; then
200
+ log "[SKIP] OSCAR already tokenized"
201
+ else
202
+ python3 - <<'PYEOF'
203
+ import glob, numpy as np
204
+ from tokenizers import Tokenizer
205
+ import pyarrow.parquet as pq
206
+ from tqdm import tqdm
207
+
208
+ tokenizer = Tokenizer.from_file("tokenizer/tokenizer.json")
209
+ files = sorted(glob.glob("data/korean_extra/oscar_ko/data/kor_Hang/*.parquet"))
210
+ all_tokens = []
211
+ doc_count = 0
212
+
213
+ for f in tqdm(files, desc="OSCAR"):
214
+ table = pq.read_table(f, columns=['text'])
215
+ for row in table.column('text'):
216
+ if row is None:
217
+ continue
218
+ parts = row.as_py()
219
+ if parts:
220
+ text = '\n'.join(item['text'] for item in parts if item and item.get('text'))
221
+ if len(text) > 50:
222
+ all_tokens.extend(tokenizer.encode(text).ids)
223
+ doc_count += 1
224
+
225
+ print(f"OSCAR: {doc_count:,} docs, {len(all_tokens):,} tokens")
226
+ n_val = int(len(all_tokens) * 0.002)
227
+ np.array(all_tokens[n_val:], dtype=np.uint16).tofile("data/oscar_train.bin")
228
+ np.array(all_tokens[:n_val], dtype=np.uint16).tofile("data/oscar_val.bin")
229
+ PYEOF
230
+ log "[DONE] OSCAR"
231
+ fi
232
+ fi
233
+
234
+ # ─── Step 4: korean_webtext ──────────────────────────────────────────────
235
+ if [[ $FROM_STEP -le 4 ]]; then
236
+ log "=== Step 4: korean_webtext 토큰화 ==="
237
+ tokenize_parquet "webtext" \
238
+ "${EXTRA_DIR}/korean_webtext/data/*.parquet" \
239
+ "text"
240
+ fi
241
+
242
+ # ─── Step 5: HPLT 한국어 추출 + 토큰화 ──────────────────────────────────
243
+ if [[ $FROM_STEP -le 5 ]]; then
244
+ log "=== Step 5: HPLT 한국어 추출 + 토큰화 ==="
245
+ HPLT_OUT="${DATA_DIR}/hplt_ko_train.bin"
246
+
247
+ if [[ -f "$HPLT_OUT" && $FROM_STEP -le 0 ]]; then
248
+ log "[SKIP] HPLT already tokenized"
249
+ else
250
+ python3 - <<'PYEOF'
251
+ import glob, numpy as np
252
+ from tokenizers import Tokenizer
253
+ import pyarrow.parquet as pq
254
+ from tqdm import tqdm
255
+
256
+ tokenizer = Tokenizer.from_file("tokenizer/tokenizer.json")
257
+ files = sorted(glob.glob("data/korean_extra/hplt_ko/en-ko/*.parquet"))
258
+ all_tokens = []
259
+ doc_count = 0
260
+
261
+ for f in tqdm(files, desc="HPLT"):
262
+ table = pq.read_table(f, columns=['tgt_doc'])
263
+ for row in table.column('tgt_doc'):
264
+ d = row.as_py()
265
+ if d and d.get('sentences'):
266
+ text = '\n'.join(s for s in d['sentences'] if s)
267
+ if len(text) > 50:
268
+ all_tokens.extend(tokenizer.encode(text).ids)
269
+ doc_count += 1
270
+
271
+ print(f"HPLT Korean: {doc_count:,} docs, {len(all_tokens):,} tokens")
272
+ n_val = int(len(all_tokens) * 0.002)
273
+ np.array(all_tokens[n_val:], dtype=np.uint16).tofile("data/hplt_ko_train.bin")
274
+ np.array(all_tokens[:n_val], dtype=np.uint16).tofile("data/hplt_ko_val.bin")
275
+ PYEOF
276
+ log "[DONE] HPLT"
277
+ fi
278
+ fi
279
+
280
+ # ─── Step 6: textbooks + finepdfs + kovast ───────────────────────────────
281
+ if [[ $FROM_STEP -le 6 ]]; then
282
+ log "=== Step 6: 기타 소스 토큰화 ==="
283
+ EXTRA_OUT="${DATA_DIR}/extra_misc_train.bin"
284
+
285
+ if [[ -f "$EXTRA_OUT" && $FROM_STEP -le 0 ]]; then
286
+ log "[SKIP] extra_misc already tokenized"
287
+ else
288
+ python3 - <<'PYEOF'
289
+ import glob, numpy as np, os
290
+ from tokenizers import Tokenizer
291
+ import pyarrow.parquet as pq
292
+ from tqdm import tqdm
293
+
294
+ tokenizer = Tokenizer.from_file("tokenizer/tokenizer.json")
295
+ all_tokens = []
296
+ doc_count = 0
297
+
298
+ # korean_textbooks (MMLU-style: look for text columns)
299
+ tb_files = glob.glob("data/korean_extra/korean_textbooks/**/*.parquet", recursive=True)
300
+ for f in tqdm(tb_files, desc="textbooks"):
301
+ try:
302
+ table = pq.read_table(f)
303
+ # Try common text columns
304
+ for col in ['question', 'text', 'input', 'instruction']:
305
+ if col in table.column_names:
306
+ for val in table.column(col):
307
+ t = val.as_py()
308
+ if t and len(t) > 20:
309
+ all_tokens.extend(tokenizer.encode(t).ids)
310
+ doc_count += 1
311
+ break
312
+ except Exception:
313
+ continue
314
+
315
+ # finepdfs
316
+ pdf_files = glob.glob("data/korean_extra/finepdfs_edu_ko/*.parquet")
317
+ for f in tqdm(pdf_files, desc="finepdfs"):
318
+ try:
319
+ table = pq.read_table(f)
320
+ for col in ['text', 'content']:
321
+ if col in table.column_names:
322
+ for val in table.column(col):
323
+ t = val.as_py()
324
+ if t and len(t) > 50:
325
+ all_tokens.extend(tokenizer.encode(t).ids)
326
+ doc_count += 1
327
+ break
328
+ except Exception:
329
+ continue
330
+
331
+ print(f"Extra: {doc_count:,} docs, {len(all_tokens):,} tokens")
332
+ n_val = int(len(all_tokens) * 0.002)
333
+ np.array(all_tokens[n_val:], dtype=np.uint16).tofile("data/extra_misc_train.bin")
334
+ np.array(all_tokens[:n_val], dtype=np.uint16).tofile("data/extra_misc_val.bin")
335
+ PYEOF
336
+ log "[DONE] extra_misc"
337
+ fi
338
+ fi
339
+
340
+ # ─── Step 7: 전체 병합 ──────────────────────────────────────────────────
341
+ if [[ $FROM_STEP -le 7 ]]; then
342
+ log "=== Step 7: 전체 병합 ==="
343
+
344
+ TRAIN_BINS=""
345
+ for f in \
346
+ "${DATA_DIR}/korean_train.bin" \
347
+ "${DATA_DIR}/culturax_train.bin" \
348
+ "${DATA_DIR}/cc100_train.bin" \
349
+ "${DATA_DIR}/oscar_train.bin" \
350
+ "${DATA_DIR}/webtext_train.bin" \
351
+ "${DATA_DIR}/hplt_ko_train.bin" \
352
+ "${DATA_DIR}/extra_misc_train.bin"; do
353
+ if [[ -f "$f" ]]; then
354
+ TRAIN_BINS="$TRAIN_BINS $f"
355
+ log " Including: $f ($(du -h "$f" | cut -f1))"
356
+ else
357
+ log " [WARN] Missing: $f"
358
+ fi
359
+ done
360
+
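+ # merge_bins.py (not shown in this diff) is assumed to take N input .bin paths followed by one
+ # output path and to concatenate the uint16 token streams in argument order.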
361
+ if [[ -n "$TRAIN_BINS" ]]; then
362
+ python3 data/merge_bins.py $TRAIN_BINS "${DATA_DIR}/merged_3b_train.bin"
363
+ log "[DONE] merged_3b_train.bin created"
364
+ fi
365
+
366
+ # Val 병합
367
+ VAL_BINS=""
368
+ for f in \
369
+ "${DATA_DIR}/korean_val.bin" \
370
+ "${DATA_DIR}/culturax_val.bin" \
371
+ "${DATA_DIR}/cc100_val.bin" \
372
+ "${DATA_DIR}/oscar_val.bin" \
373
+ "${DATA_DIR}/webtext_val.bin" \
374
+ "${DATA_DIR}/hplt_ko_val.bin" \
375
+ "${DATA_DIR}/extra_misc_val.bin"; do
376
+ if [[ -f "$f" ]]; then
377
+ VAL_BINS="$VAL_BINS $f"
378
+ fi
379
+ done
380
+
381
+ if [[ -n "$VAL_BINS" ]]; then
382
+ python3 data/merge_bins.py $VAL_BINS "${DATA_DIR}/merged_3b_val.bin"
383
+ log "[DONE] merged_3b_val.bin created"
384
+ fi
385
+ fi
386
+
387
+ # ─── Step 8: 검증 ────────────────────────────────────────────────────────
388
+ if [[ $FROM_STEP -le 8 ]]; then
389
+ log "=== Step 8: 최종 검증 ==="
390
+ python3 - <<'PYEOF'
391
+ import os, glob
392
+ import numpy as np
393
+
394
+ print("=== 토큰화 결과 ===")
395
+ total_train = 0
396
+ total_val = 0
397
+ for f in sorted(glob.glob("data/*_train.bin") + glob.glob("data/train.bin")):
+ if "merged_3b" in f:  # skip the merged output so sources are not double-counted
+ continue
398
+ n = os.path.getsize(f) // 2
399
+ total_train += n
400
+ print(f" {os.path.basename(f):30s}: {n:>15,} tokens ({os.path.getsize(f)/1e9:.2f} GB)")
401
+
402
+ for f in sorted(glob.glob("data/*_val.bin") + glob.glob("data/val.bin")):
+ if "merged_3b" in f:
+ continue
403
+ n = os.path.getsize(f) // 2
404
+ total_val += n
405
+
406
+ print(f"\n Total train: {total_train:,} tokens ({total_train/1e9:.1f}B)")
407
+ print(f" Total val: {total_val:,} tokens ({total_val/1e6:.1f}M)")
408
+ print(f"\n 3B Chinchilla minimum: 60B tokens")
409
+ print(f" Epochs needed for 60B: {60e9/total_train:.1f}")
410
+ print(f" Epochs needed for 100B: {100e9/total_train:.1f}")
411
+ PYEOF
412
+ fi
413
+
414
+ log "=== 파이프라인 완료 ==="
source/scripts/prepare_sft_combined.sh ADDED
@@ -0,0 +1,264 @@
1
+ #!/usr/bin/env bash
2
+ # prepare_sft_combined.sh — 3B SFT용 전체 데이터 통합
3
+ # 모든 SFT 데이터를 하나의 train/val 파일로 합침
4
+ #
5
+ # 업데이트 (2026-03-02): sft_extra 신규 소스 추가
6
+ # - nayohan_Evol-Instruct-Code-80k-v1-ko (코드 instruction)
7
+ # - FreedomIntelligence_alpaca-gpt4-korean (GPT-4 alpaca 한국어)
8
+ # - FreedomIntelligence_evol-instruct-korean (evol-instruct 한국어)
9
+ # - coastral_korean-writing-style-instruct (한국어 글쓰기 스타일)
10
+ # - maywell_ko_wikidata_QA (위키데이터 QA)
11
+ # - OpenAssistant_oasst1_ko (OASST1 한국어, 트리 재구성)
12
+ # - Bllossom_evol-instruct-ko (존재 확인 후 로드)
13
+ set -euo pipefail
14
+ BASE="$(cd "$(dirname "$0")/.." && pwd)"
15
+ OUT_DIR="$BASE/data/sft_combined"
16
+ mkdir -p "$OUT_DIR"
17
+
18
+ python3 << 'PYEOF'
19
+ import json, random, os, glob, hashlib
20
+ from collections import defaultdict
21
+
22
+ BASE = "/PROJECT/0325120031_A/ghong/taketimes/llm-bang/data"
23
+ OUT_TRAIN = f"{BASE}/sft_combined/train.jsonl"
24
+ OUT_VAL = f"{BASE}/sft_combined/val.jsonl"
25
+ VAL_RATIO = 0.02
26
+ SEED = 42
27
+
28
+ # SFT 소스 파일 목록 (chat 포맷으로 변환 가능한 것들)
29
+ SOURCES = [
30
+ # (path, fmt) fmt: "messages" | "auto" | "oasst"
31
+ (f"{BASE}/sft/train.jsonl", "messages"),
32
+ (f"{BASE}/sft_extra/ultrachat_200k/train_sft.jsonl", "messages"),
33
+ (f"{BASE}/sft_extra/open_korean_instructions/train.jsonl", "messages"),
34
+ (f"{BASE}/sft_extra/korean_instruction_mix/train.jsonl", "messages"),
35
+ (f"{BASE}/sft_extra/openhermes_2.5/train.jsonl", "messages"),
36
+ (f"{BASE}/sft_extra/magpie_reasoning_v2/train.jsonl", "messages"),
37
+ (f"{BASE}/sft_extra/magpie_reasoning_ko/train.jsonl", "messages"),
38
+ (f"{BASE}/sft_extra/reasoning_r1_1.4m/train.jsonl", "messages"),
39
+ (f"{BASE}/sft_extra/lemon-mint_smol-koreantalk.jsonl", "auto"),
40
+ (f"{BASE}/sft_extra/dbdu_ShareGPT-74k-ko.jsonl", "auto"),
41
+ (f"{BASE}/sft_extra/ko_lima/data.jsonl", "auto"),
42
+ (f"{BASE}/sft_extra/koalpaca_v1_1a/data.jsonl", "auto"),
43
+ (f"{BASE}/sft_extra/kullm_v2/data.jsonl", "auto"),
44
+ (f"{BASE}/sft_extra/kuotient_orca-math-word-problems-193k-korean.jsonl", "auto"),
45
+ (f"{BASE}/sft_extra/kyujinpy_KOR-OpenOrca-Platypus-v3/data.jsonl", "auto"),
46
+ (f"{BASE}/sft_extra/nlp-with-deeplearning_Ko.WizardLM_evol_instruct_V2_196k.jsonl", "auto"),
47
+ (f"{BASE}/sft_extra/AI-MO_NuminaMath-CoT/data.jsonl", "auto"),
48
+ (f"{BASE}/sft_extra/zwhe99_DeepMath-103K/data.jsonl", "auto"),
49
+ # ---- 신규 소스 (2026-03-02) ----
50
+ (f"{BASE}/sft_extra/nayohan_Evol-Instruct-Code-80k-v1-ko/data.jsonl", "auto"),
51
+ (f"{BASE}/sft_extra/FreedomIntelligence_alpaca-gpt4-korean.jsonl", "auto"),
52
+ (f"{BASE}/sft_extra/FreedomIntelligence_evol-instruct-korean.jsonl", "auto"),
53
+ (f"{BASE}/sft_extra/coastral_korean-writing-style-instruct.jsonl", "auto"),
54
+ (f"{BASE}/sft_extra/maywell_ko_wikidata_QA.jsonl", "auto"),
55
+ (f"{BASE}/sft_extra/OpenAssistant_oasst1_ko.jsonl", "oasst"),
56
+ (f"{BASE}/sft_extra/Bllossom_evol-instruct-ko/data.jsonl", "auto"),
57
+ ]
58
+
59
+ def to_messages(obj):
60
+ """다양한 포맷을 통일된 messages 포맷으로 변환"""
61
+ # 이미 messages 포맷
62
+ if 'messages' in obj and isinstance(obj['messages'], list):
63
+ return obj['messages']
64
+ # conversations 포맷
65
+ if 'conversations' in obj:
66
+ msgs = []
67
+ for turn in obj['conversations']:
68
+ role = turn.get('from', turn.get('role', ''))
69
+ content = turn.get('value', turn.get('content', ''))
70
+ if role in ('human', 'user', 'prompter'):
71
+ msgs.append({'role': 'user', 'content': content})
72
+ elif role in ('gpt', 'assistant', 'bot'):
73
+ msgs.append({'role': 'assistant', 'content': content})
74
+ return msgs if len(msgs) >= 2 else None
75
+ # instruction/output 포맷
76
+ if 'instruction' in obj:
77
+ instruction = obj['instruction']
78
+ inp = obj.get('input', '')
79
+ output = obj.get('output', obj.get('response', ''))
80
+ if not output: return None
81
+ user_content = instruction + ('\n\n' + inp if inp else '')
82
+ return [{'role': 'user', 'content': user_content}, {'role': 'assistant', 'content': output}]
83
+ # question/answer 포맷
84
+ if 'question' in obj and 'answer' in obj:
85
+ return [{'role': 'user', 'content': obj['question']}, {'role': 'assistant', 'content': obj['answer']}]
86
+ # prompt/response
87
+ if 'prompt' in obj and ('response' in obj or 'completion' in obj):
88
+ resp = obj.get('response', obj.get('completion', ''))
89
+ return [{'role': 'user', 'content': obj['prompt']}, {'role': 'assistant', 'content': resp}]
90
+ # problem/solution
91
+ if 'problem' in obj and 'solution' in obj:
92
+ return [{'role': 'user', 'content': obj['problem']}, {'role': 'assistant', 'content': obj['solution']}]
93
+ return None
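+ # e.g. {"instruction": ..., "input": ..., "output": ...} maps to
+ #      [{"role": "user", "content": instruction + "\n\n" + input}, {"role": "assistant", "content": output}]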
94
+
95
+
96
+ def load_oasst(path):
97
+ """
98
+ OpenAssistant OASST1 flat message 포맷을 대화 트리로 재구성.
99
+ 각 루트(prompter) 메시지에서 best-ranked assistant 응답(rank=0.0)을
100
+ 따라 단일 대화 스레드를 추출한다.
101
+ deleted=True 메시지와 review_result=False 메시지는 제외.
102
+ """
103
+ nodes = {} # message_id → obj
104
+ children = defaultdict(list) # parent_id → [child_obj, ...]
105
+
106
+ with open(path, 'r', errors='replace') as f:
107
+ for line in f:
108
+ line = line.strip()
109
+ if not line:
110
+ continue
111
+ try:
112
+ obj = json.loads(line)
113
+ except Exception:
114
+ continue
115
+ if obj.get('deleted', False):
116
+ continue
117
+ if obj.get('review_result') is False:
118
+ continue
119
+ mid = obj.get('message_id')
120
+ if mid:
121
+ nodes[mid] = obj
122
+ pid = obj.get('parent_id')
123
+ if pid:
124
+ children[pid].append(obj)
125
+
126
+ # 자식 목록을 rank 오름차순 정렬 (rank=null은 뒤로)
127
+ def sort_key(c):
128
+ r = c.get('rank')
129
+ mid = c.get('message_id', '')
130
+ return (1, 0, mid) if r is None else (0, r, mid)
131
+ for pid in children:
132
+ children[pid].sort(key=sort_key)
133
+
134
+ samples = []
135
+
136
+ def build_thread(node, current_msgs):
137
+ """재귀적으로 대화 스레드를 따라 samples에 추가."""
138
+ role = node.get('role', '')
139
+ text = node.get('text', '')
140
+ if role == 'prompter':
141
+ mapped_role = 'user'
142
+ elif role == 'assistant':
143
+ mapped_role = 'assistant'
144
+ else:
145
+ return
146
+
147
+ msgs = current_msgs + [{'role': mapped_role, 'content': text}]
148
+
149
+ # 유효한 user→assistant 쌍이 있을 때만 샘플 추가
150
+ if mapped_role == 'assistant' and len(msgs) >= 2:
151
+ samples.append({'messages': msgs})
152
+
153
+ # 자식 중 best (rank=0.0) 하나만 따라간다 (가장 품질 높은 경로)
154
+ kids = children.get(node.get('message_id'), [])
155
+ if kids:
156
+ build_thread(kids[0], msgs)
157
+
158
+ # 루트 노드: parent_id가 없는 prompter 메시지
159
+ roots = [n for n in nodes.values() if n.get('parent_id') is None and n.get('role') == 'prompter']
160
+ for root in roots:
161
+ build_thread(root, [])
162
+
163
+ return samples
164
+
165
+
166
+ random.seed(SEED)
167
+ all_samples = []
168
+
169
+ for path, fmt in SOURCES:
170
+ if not os.path.exists(path):
171
+ print(f"[SKIP] {path}")
172
+ continue
173
+
174
+ if fmt == "oasst":
175
+ samples = load_oasst(path)
176
+ all_samples.extend(samples)
177
+ print(f"[LOADED] {os.path.basename(path)}: {len(samples):,} samples (oasst tree)")
178
+ continue
179
+
180
+ count = 0
181
+ with open(path, 'r', errors='replace') as f:
182
+ for line in f:
183
+ line = line.strip()
184
+ if not line: continue
185
+ try:
186
+ obj = json.loads(line)
187
+ except Exception:
188
+ continue
189
+ if fmt == "messages":
190
+ msgs = obj.get('messages') or obj.get('conversations')
191
+ if msgs:
192
+ all_samples.append({'messages': msgs})
193
+ count += 1
194
+ else: # auto detect
195
+ msgs = to_messages(obj)
196
+ if msgs and len(msgs) >= 2:
197
+ all_samples.append({'messages': msgs})
198
+ count += 1
199
+ print(f"[LOADED] {os.path.basename(path)}: {count:,} samples")
200
+ if count == 0:
201
+ print(f"[WARN] {os.path.basename(path)}: 0 samples extracted (format detection may have failed)")
202
+
203
+ print(f"\n총 샘플: {len(all_samples):,}")
204
+
205
+ # ---- Deduplication (MD5 of first user message) ----
206
+ seen_hashes = set()
207
+ unique_samples = []
208
+ dup_count = 0
209
+ for s in all_samples:
210
+ msgs = s.get('messages', [])
211
+ first_user = next((m['content'] for m in msgs if m.get('role') == 'user'), '')
212
+ h = hashlib.md5(first_user.encode('utf-8', errors='replace')).hexdigest()
213
+ if h in seen_hashes:
214
+ dup_count += 1
215
+ continue
216
+ seen_hashes.add(h)
217
+ unique_samples.append(s)
218
+
219
+ print(f"[DEDUP] 제거: {dup_count:,}, 남은 샘플: {len(unique_samples):,}")
220
+ all_samples = unique_samples
221
+
222
+ # ---- Format validation ----
223
+ def validate_messages(msgs):
224
+ """Check messages have valid role/content structure."""
225
+ if not isinstance(msgs, list) or len(msgs) < 2:
226
+ return False
227
+ for m in msgs:
228
+ if not isinstance(m, dict):
229
+ return False
230
+ if m.get('role') not in ('user', 'assistant', 'system'):
231
+ return False
232
+ if not isinstance(m.get('content'), str):
233
+ return False
234
+ return True
235
+
236
+ valid_samples = []
237
+ invalid_count = 0
238
+ for s in all_samples:
239
+ if validate_messages(s.get('messages', [])):
240
+ valid_samples.append(s)
241
+ else:
242
+ invalid_count += 1
243
+
244
+ print(f"[VALIDATE] 유효하지 않은 포맷 제거: {invalid_count:,}, 남은 샘플: {len(valid_samples):,}")
245
+ all_samples = valid_samples
246
+
247
+ random.shuffle(all_samples)
248
+
249
+ n_val = int(len(all_samples) * VAL_RATIO)
250
+ val_samples = all_samples[:n_val]
251
+ train_samples = all_samples[n_val:]
252
+
253
+ os.makedirs(os.path.dirname(OUT_TRAIN), exist_ok=True)
254
+ with open(OUT_TRAIN, 'w') as f:
255
+ for s in train_samples:
256
+ f.write(json.dumps(s, ensure_ascii=False) + '\n')
257
+ with open(OUT_VAL, 'w') as f:
258
+ for s in val_samples:
259
+ f.write(json.dumps(s, ensure_ascii=False) + '\n')
260
+
261
+ print(f"[DONE] train: {len(train_samples):,} → {OUT_TRAIN}")
262
+ print(f"[DONE] val: {len(val_samples):,} → {OUT_VAL}")
263
+ PYEOF
264
+ echo "SFT 데이터 병합 완료"
source/scripts/quality_gate.sh ADDED
@@ -0,0 +1,518 @@
1
+ #!/usr/bin/env bash
2
+ # =============================================================================
3
+ # quality_gate.sh — Phase 완료 자동 품질 게이트 검증
4
+ #
5
+ # Usage:
6
+ # bash scripts/quality_gate.sh <phase>
7
+ #
8
+ # Phases:
9
+ # pretrain — 사전학습 게이트 (val_loss, loss 단조 감소)
10
+ # sft — SFT 게이트 (val_loss 수렴, 반복률, KoBEST)
11
+ # orpo — ORPO 게이트 (반복률, KoBEST, chosen > rejected)
12
+ # deploy — 배포 게이트 (GGUF perplexity, Ollama 응답)
13
+ # all — 모든 게이트 순차 실행
14
+ #
15
+ # Exit codes:
16
+ # 0 — 게이트 통과
17
+ # 1 — 게이트 실패 (기준 미달)
18
+ # 2 — 필수 파일 / 의존성 없음 (실행 불가)
19
+ # =============================================================================
20
+ set -uo pipefail
21
+
22
+ PROJECT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
23
+
24
+ # ---------------------------------------------------------------------------
25
+ # 색상 출력 헬퍼
26
+ # ---------------------------------------------------------------------------
27
+ _RED='\033[0;31m'
28
+ _GREEN='\033[0;32m'
29
+ _YELLOW='\033[1;33m'
30
+ _BLUE='\033[0;34m'
31
+ _NC='\033[0m'
32
+
33
+ log_info() { echo -e "${_BLUE}[INFO]${_NC} $*"; }
34
+ log_ok() { echo -e "${_GREEN}[PASS]${_NC} $*"; }
35
+ log_warn() { echo -e "${_YELLOW}[WARN]${_NC} $*"; }
36
+ log_fail() { echo -e "${_RED}[FAIL]${_NC} $*"; }
37
+ log_skip() { echo -e " [SKIP] $*"; }
38
+
39
+ # ---------------------------------------------------------------------------
40
+ # 유틸리티: Python 한 줄 표현식 평가 (부동소수점 비교)
41
+ # ---------------------------------------------------------------------------
42
+ py_eval() {
43
+ python3 -c "import sys; sys.exit(0 if ($1) else 1)"
44
+ }
45
+
46
+ py_value() {
47
+ python3 -c "print($1)"
48
+ }
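+ # e.g. py_eval "2.3 < 2.5" exits 0 (true); py_value "0.62 * 100" prints 62.0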
49
+
50
+ # ---------------------------------------------------------------------------
51
+ # 유틸리티: JSON에서 값 추출
52
+ # ---------------------------------------------------------------------------
53
+ json_get() {
54
+ local file="$1" key="$2"
55
+ python3 -c "
56
+ import json, sys
57
+ try:
58
+ d = json.load(open('$file'))
59
+ keys = '$key'.split('.')
60
+ for k in keys:
61
+ d = d[k]
62
+ print(d)
63
+ except Exception as e:
64
+ print('NOT_FOUND')
65
+ sys.exit(1)
66
+ "
67
+ }
68
+
69
+ # ---------------------------------------------------------------------------
70
+ # 게이트 결과 집계
71
+ # ---------------------------------------------------------------------------
72
+ GATE_PASS=0
73
+ GATE_FAIL=0
74
+ GATE_SKIP=0
75
+
76
+ record_pass() { GATE_PASS=$((GATE_PASS + 1)); log_ok "$*"; }
77
+ record_fail() { GATE_FAIL=$((GATE_FAIL + 1)); log_fail "$*"; }
78
+ record_skip() { GATE_SKIP=$((GATE_SKIP + 1)); log_skip "$*"; }
79
+
80
+ # =============================================================================
81
+ # Gate 1: Pretrain
82
+ # =============================================================================
83
+ gate_pretrain() {
84
+ echo ""
85
+ echo "=================================================================="
86
+ echo " Gate: PRETRAIN"
87
+ echo " 기준: val_loss < 2.5 | loss 단조 감소 확인"
88
+ echo "=================================================================="
89
+
90
+ # 최신 체크포인트 디렉토리 탐색
91
+ CKPT_BASE="$PROJECT_DIR/checkpoints"
92
+ METRICS_FILE=""
93
+
94
+ # metrics.json 또는 train_log.jsonl 탐색
95
+ for candidate in \
96
+ "$CKPT_BASE/korean_3b_fp8_pretrain/metrics.json" \
97
+ "$CKPT_BASE/korean_3b_pretrain/metrics.json" \
98
+ "$PROJECT_DIR/outputs/pretrain_metrics.json" \
99
+ "$PROJECT_DIR/logs/pretrain_metrics.json"
100
+ do
101
+ if [[ -f "$candidate" ]]; then
102
+ METRICS_FILE="$candidate"
103
+ break
104
+ fi
105
+ done
106
+
107
+ if [[ -z "$METRICS_FILE" ]]; then
108
+ log_warn "사전학습 메트릭 파일을 찾을 수 없습니다."
109
+ log_warn "찾는 경로: $CKPT_BASE/korean_3b_*/metrics.json"
110
+ log_warn "메트릭 파일이 없으면 학습 스크립트에서 아래 형식으로 저장하세요:"
111
+ log_warn ' {"val_loss": 2.3, "loss_history": [3.1, 2.8, 2.5, 2.3]}'
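+ # A minimal way to write this file from a training script (sketch; the path is just one of the
+ # candidate locations searched above):
+ #   python3 -c 'import json; json.dump({"val_loss": 2.3, "loss_history": [3.1, 2.8, 2.5, 2.3]}, open("checkpoints/korean_3b_fp8_pretrain/metrics.json", "w"))'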
112
+ record_skip "메트릭 파일 없음 — 게이트 건너뜀"
113
+ return 0
114
+ fi
115
+
116
+ log_info "메트릭 파일: $METRICS_FILE"
117
+
118
+ # val_loss 확인
119
+ VAL_LOSS=$(json_get "$METRICS_FILE" "val_loss" 2>/dev/null || echo "NOT_FOUND")
120
+ if [[ "$VAL_LOSS" == "NOT_FOUND" ]]; then
121
+ record_skip "val_loss 키 없음 — 건너뜀"
122
+ else
123
+ log_info "val_loss = $VAL_LOSS (기준: < 2.5)"
124
+ if py_eval "$VAL_LOSS < 2.5" 2>/dev/null; then
125
+ record_pass "val_loss $VAL_LOSS < 2.5"
126
+ else
127
+ record_fail "val_loss $VAL_LOSS >= 2.5 (기준 미달)"
128
+ fi
129
+ fi
130
+
131
+ # loss 단조 감소 확인 (loss_history)
132
+ python3 - "$METRICS_FILE" <<'PYEOF'
133
+ import json, sys
134
+
135
+ metrics_file = sys.argv[1]
136
+ try:
137
+ d = json.load(open(metrics_file))
138
+ history = d.get("loss_history", [])
139
+ except Exception as e:
140
+ print(f"[SKIP] loss_history 읽기 실패: {e}")
141
+ sys.exit(2)
142
+
143
+ if len(history) < 2:
144
+ print(f"[SKIP] loss_history 데이터 부족 ({len(history)}개)")
+ sys.exit(2)
146
+
147
+ # 전체 추세가 감소하는지 확인 (처음 1/4 vs 마지막 1/4 평균 비교)
148
+ n = len(history)
149
+ q = max(1, n // 4)
150
+ early_avg = sum(history[:q]) / q
151
+ late_avg = sum(history[-q:]) / q
152
+
153
+ if late_avg < early_avg:
154
+ print(f"[PASS] loss 단조 감소 확인: 초기 avg={early_avg:.4f} → 최근 avg={late_avg:.4f}")
155
+ sys.exit(0)
156
+ else:
157
+ print(f"[FAIL] loss 감소 미확인: 초기 avg={early_avg:.4f}, 최근 avg={late_avg:.4f}")
158
+ sys.exit(1)
159
+ PYEOF
160
+ local mono_exit=$?
161
+ if [[ $mono_exit -eq 0 ]]; then
+ GATE_PASS=$((GATE_PASS + 1))
+ elif [[ $mono_exit -eq 1 ]]; then
+ GATE_FAIL=$((GATE_FAIL + 1))
+ else
+ GATE_SKIP=$((GATE_SKIP + 1))   # exit 2 = skipped (missing or unreadable metrics)
+ fi
167
+ }
168
+
169
+ # =============================================================================
170
+ # Gate 2: SFT
171
+ # =============================================================================
172
+ gate_sft() {
173
+ echo ""
174
+ echo "=================================================================="
175
+ echo " Gate: SFT"
176
+ echo " 기준: val_loss 수렴 | 반복률 < 15% | KoBEST > 55%"
177
+ echo "=================================================================="
178
+
179
+ METRICS_FILE=""
180
+ for candidate in \
181
+ "$PROJECT_DIR/outputs/sft_metrics.json" \
182
+ "$PROJECT_DIR/logs/sft_metrics.json" \
183
+ "$PROJECT_DIR/checkpoints/sft/metrics.json"
184
+ do
185
+ if [[ -f "$candidate" ]]; then
186
+ METRICS_FILE="$candidate"
187
+ break
188
+ fi
189
+ done
190
+
191
+ if [[ -z "$METRICS_FILE" ]]; then
192
+ log_warn "SFT 메트릭 파일을 찾을 수 없습니다."
193
+ log_warn ' {"val_loss": 1.8, "rep_rate": 0.08, "kobest_score": 0.62}'
194
+ record_skip "SFT 메트릭 파일 없음 — 게이트 건너뜀"
195
+ return 0
196
+ fi
197
+
198
+ log_info "메트릭 파일: $METRICS_FILE"
199
+
200
+ # val_loss 수렴 (상대 변화율 < 1% — 마지막 두 체크포인트)
201
+ python3 - "$METRICS_FILE" <<'PYEOF'
202
+ import json, sys
203
+
204
+ metrics_file = sys.argv[1]
205
+ try:
206
+ d = json.load(open(metrics_file))
207
+ history = d.get("val_loss_history", [])
208
+ except Exception as e:
209
+ print(f"[SKIP] val_loss_history 읽기 실패: {e}")
210
+ sys.exit(2)
211
+
212
+ if len(history) < 2:
213
+ # 단일 val_loss만 있으면 단순 확인
214
+ val_loss = d.get("val_loss")
215
+ if val_loss is not None:
216
+ print(f"[INFO] val_loss = {val_loss} (수렴 히스토리 없음 — 단일 값 확인 건너뜀)")
+ sys.exit(2)
+ # no history and no single val_loss: nothing to check
+ print("[SKIP] neither val_loss_history nor val_loss found")
+ sys.exit(2)
218
+
219
+ last = history[-1]
220
+ second = history[-2]
221
+ rel_change = abs(last - second) / max(abs(second), 1e-9)
222
+
223
+ if rel_change < 0.01:
224
+ print(f"[PASS] val_loss 수렴 (상대변화율 {rel_change*100:.3f}% < 1%): {second:.4f} → {last:.4f}")
225
+ sys.exit(0)
226
+ else:
227
+ print(f"[FAIL] val_loss 미수렴 (상대변화율 {rel_change*100:.3f}% >= 1%): {second:.4f} → {last:.4f}")
228
+ sys.exit(1)
229
+ PYEOF
230
+ local conv_exit=$?
231
+ if [[ $conv_exit -eq 0 ]]; then
+ GATE_PASS=$((GATE_PASS + 1))
+ elif [[ $conv_exit -eq 1 ]]; then
+ GATE_FAIL=$((GATE_FAIL + 1))
+ else
+ GATE_SKIP=$((GATE_SKIP + 1))   # exit 2 = skipped
+ fi
232
+
233
+ # 반복률 확인
234
+ REP_RATE=$(json_get "$METRICS_FILE" "rep_rate" 2>/dev/null || echo "NOT_FOUND")
235
+ if [[ "$REP_RATE" == "NOT_FOUND" ]]; then
236
+ record_skip "rep_rate 키 없음 — 건너뜀"
237
+ else
238
+ REP_PCT=$(py_value "$REP_RATE * 100")
239
+ log_info "반복률 = ${REP_PCT}% (기준: < 15%)"
240
+ if py_eval "$REP_RATE < 0.15" 2>/dev/null; then
241
+ record_pass "반복률 ${REP_PCT}% < 15%"
242
+ else
243
+ record_fail "반복률 ${REP_PCT}% >= 15% (기준 미달)"
244
+ fi
245
+ fi
246
+
247
+ # KoBEST 확인
248
+ KOBEST=$(json_get "$METRICS_FILE" "kobest_score" 2>/dev/null || echo "NOT_FOUND")
249
+ if [[ "$KOBEST" == "NOT_FOUND" ]]; then
250
+ record_skip "kobest_score 키 없음 — 건너뜀"
251
+ else
252
+ KOBEST_PCT=$(py_value "$KOBEST * 100")
253
+ log_info "KoBEST = ${KOBEST_PCT}% (기준: > 55%)"
254
+ if py_eval "$KOBEST > 0.55" 2>/dev/null; then
255
+ record_pass "KoBEST ${KOBEST_PCT}% > 55%"
256
+ else
257
+ record_fail "KoBEST ${KOBEST_PCT}% <= 55% (기준 미달)"
258
+ fi
259
+ fi
260
+ }
261
+
262
+ # =============================================================================
263
+ # Gate 3: ORPO
264
+ # =============================================================================
265
+ gate_orpo() {
266
+ echo ""
267
+ echo "=================================================================="
268
+ echo " Gate: ORPO"
269
+ echo " 기준: 반복률 < 5% | KoBEST > 60% | chosen > rejected 90%+"
270
+ echo "=================================================================="
271
+
272
+ METRICS_FILE=""
273
+ for candidate in \
274
+ "$PROJECT_DIR/outputs/orpo_metrics.json" \
275
+ "$PROJECT_DIR/logs/orpo_metrics.json" \
276
+ "$PROJECT_DIR/checkpoints/orpo/metrics.json"
277
+ do
278
+ if [[ -f "$candidate" ]]; then
279
+ METRICS_FILE="$candidate"
280
+ break
281
+ fi
282
+ done
283
+
284
+ if [[ -z "$METRICS_FILE" ]]; then
285
+ log_warn "ORPO 메트릭 파일을 찾을 수 없습니다."
286
+ log_warn ' {"rep_rate": 0.03, "kobest_score": 0.63, "chosen_win_rate": 0.92}'
287
+ record_skip "ORPO 메트릭 파일 없음 — 게이트 건너뜀"
288
+ return 0
289
+ fi
290
+
291
+ log_info "메트릭 파일: $METRICS_FILE"
292
+
293
+ # 반복률 (더 엄격: < 5%)
294
+ REP_RATE=$(json_get "$METRICS_FILE" "rep_rate" 2>/dev/null || echo "NOT_FOUND")
295
+ if [[ "$REP_RATE" == "NOT_FOUND" ]]; then
296
+ record_skip "rep_rate 키 없음 — 건너뜀"
297
+ else
298
+ REP_PCT=$(py_value "$REP_RATE * 100")
299
+ log_info "반복률 = ${REP_PCT}% (기준: < 5%)"
300
+ if py_eval "$REP_RATE < 0.05" 2>/dev/null; then
301
+ record_pass "반복률 ${REP_PCT}% < 5%"
302
+ else
303
+ record_fail "반복률 ${REP_PCT}% >= 5% (기준 미달)"
304
+ fi
305
+ fi
306
+
307
+ # KoBEST (더 엄격: > 60%)
308
+ KOBEST=$(json_get "$METRICS_FILE" "kobest_score" 2>/dev/null || echo "NOT_FOUND")
309
+ if [[ "$KOBEST" == "NOT_FOUND" ]]; then
310
+ record_skip "kobest_score 키 없음 — 건너뜀"
311
+ else
312
+ KOBEST_PCT=$(py_value "$KOBEST * 100")
313
+ log_info "KoBEST = ${KOBEST_PCT}% (기준: > 60%)"
314
+ if py_eval "$KOBEST > 0.60" 2>/dev/null; then
315
+ record_pass "KoBEST ${KOBEST_PCT}% > 60%"
316
+ else
317
+ record_fail "KoBEST ${KOBEST_PCT}% <= 60% (기준 미달)"
318
+ fi
319
+ fi
320
+
321
+ # Chosen win rate (chosen log-prob > rejected log-prob 비율)
322
+ CHOSEN_WIN=$(json_get "$METRICS_FILE" "chosen_win_rate" 2>/dev/null || echo "NOT_FOUND")
323
+ if [[ "$CHOSEN_WIN" == "NOT_FOUND" ]]; then
324
+ record_skip "chosen_win_rate 키 없음 — 건너뜀"
325
+ else
326
+ WIN_PCT=$(py_value "$CHOSEN_WIN * 100")
327
+ log_info "Chosen win rate = ${WIN_PCT}% (기준: >= 90%)"
328
+ if py_eval "$CHOSEN_WIN >= 0.90" 2>/dev/null; then
329
+ record_pass "Chosen win rate ${WIN_PCT}% >= 90%"
330
+ else
331
+ record_fail "Chosen win rate ${WIN_PCT}% < 90% (기준 미달)"
332
+ fi
333
+ fi
334
+ }
335
+
336
+ # =============================================================================
337
+ # Gate 4: Deploy
338
+ # =============================================================================
339
+ gate_deploy() {
340
+ echo ""
341
+ echo "=================================================================="
342
+ echo " Gate: DEPLOY"
343
+ echo " 기준: Q4_K_M perplexity < F16 × 1.05 | Ollama 5개 프롬프트 응답"
344
+ echo "=================================================================="
345
+
346
+ local MODEL_NAME="frankenstallm-3b"
347
+ local GGUF_DIR="$PROJECT_DIR/outputs/gguf"
348
+ local F16_GGUF="$GGUF_DIR/${MODEL_NAME}-f16.gguf"
349
+ local Q4KM_GGUF="$GGUF_DIR/${MODEL_NAME}-Q4_K_M.gguf"
350
+
351
+ # --- GGUF 파일 존재 확인 ---
352
+ if [[ ! -f "$Q4KM_GGUF" ]]; then
353
+ log_warn "Q4_K_M GGUF 파일 없음: $Q4KM_GGUF"
354
+ log_warn "먼저 실행: bash scripts/convert_3b_gguf.sh"
355
+ record_skip "GGUF 파일 없음 — perplexity 게이트 건너뜀"
356
+ else
357
+ # perplexity 측정 (llama-perplexity 또는 Python fallback)
358
+ LLAMA_PPL_BIN="$PROJECT_DIR/outputs/llama.cpp/build/bin/llama-perplexity"
359
+
360
+ if [[ ! -f "$LLAMA_PPL_BIN" ]]; then
361
+ log_warn "llama-perplexity 바이너리 없음 — 빌드 시도 중 ..."
362
+ cmake --build "$PROJECT_DIR/outputs/llama.cpp/build" \
363
+ --target llama-perplexity -j "$(nproc)" &>/dev/null || true
364
+ fi
365
+
366
+ # 샘플 텍스트로 perplexity 비교
367
+ SAMPLE_TEXT="$PROJECT_DIR/outputs/gguf/ppl_sample.txt"
368
+ if [[ ! -f "$SAMPLE_TEXT" ]]; then
369
+ # 짧은 한국어 샘플 생성
370
+ cat > "$SAMPLE_TEXT" <<'SAMPLE'
371
+ 인공지능은 현대 사회에서 매우 중요한 기술로 자리잡고 있습니다.
372
+ 기계 학습과 딥러닝의 발전으로 인해 다양한 분야에서 혁신이 이루어지고 있습니다.
373
+ 자연어 처리 기술의 발전은 인간과 컴퓨터의 상호작용 방식을 근본적으로 변화시키고 있습니다.
374
+ 한국어는 교착어로서 특유의 형태론적 특성을 가지고 있어 자연어 처리에 독특한 도전을 제시합니다.
375
+ 대규모 언어 모델의 등장으로 기계 번역, 텍스트 요약, 질의응답 등의 성능이 크게 향상되었습니다.
376
+ SAMPLE
377
+ fi
378
+
379
+ if [[ -f "$LLAMA_PPL_BIN" && -f "$F16_GGUF" ]]; then
380
+ log_info "Perplexity 측정 중 (F16 vs Q4_K_M) ..."
381
+
382
+ PPL_F16=$(timeout 120 "$LLAMA_PPL_BIN" -m "$F16_GGUF" -f "$SAMPLE_TEXT" 2>&1 \
383
+ | grep -oP "Perplexity: \K[0-9.]+" | head -1 || echo "0")
384
+ PPL_Q4=$(timeout 120 "$LLAMA_PPL_BIN" -m "$Q4KM_GGUF" -f "$SAMPLE_TEXT" 2>&1 \
385
+ | grep -oP "Perplexity: \K[0-9.]+" | head -1 || echo "0")
386
+
387
+ if [[ "$PPL_F16" == "0" || "$PPL_Q4" == "0" ]]; then
388
+ record_skip "Perplexity 측정 실패 — 건너뜀"
389
+ else
390
+ THRESHOLD=$(py_value "$PPL_F16 * 1.05")
391
+ log_info "F16 PPL = $PPL_F16 | Q4_K_M PPL = $PPL_Q4 | 기준: < $THRESHOLD"
392
+ if py_eval "$PPL_Q4 < $PPL_F16 * 1.05" 2>/dev/null; then
393
+ record_pass "Q4_K_M PPL $PPL_Q4 < F16 PPL × 1.05 ($THRESHOLD)"
394
+ else
395
+ record_fail "Q4_K_M PPL $PPL_Q4 >= F16 PPL × 1.05 ($THRESHOLD)"
396
+ fi
397
+ fi
398
+ else
399
+ record_skip "llama-perplexity 또는 F16 GGUF 없음 — perplexity 게이트 건너뜀"
400
+ fi
401
+ fi
402
+
403
+ # --- Ollama 응답 테스트 ---
404
+ if ! command -v ollama &>/dev/null; then
405
+ record_skip "ollama 없음 — 응답 테스트 건너뜀"
406
+ return 0
407
+ fi
408
+
409
+ if ! ollama list 2>/dev/null | grep -q "$MODEL_NAME"; then
410
+ log_warn "Ollama에 $MODEL_NAME 모델이 등록되지 않았습니다."
411
+ log_warn "먼저 실행: bash scripts/deploy_3b_ollama.sh"
412
+ record_skip "Ollama 모델 미등록 — 응답 테스트 건너뜀"
413
+ return 0
414
+ fi
415
+
416
+ log_info "Ollama 응답 테스트 (5개 프롬프트) ..."
417
+
418
+ declare -a PROMPTS=(
419
+ "안녕하세요."
420
+ "1 더하기 1은 무엇인가요?"
421
+ "파이썬이란 무엇인가요?"
422
+ "한국의 수도는 어디인가요?"
423
+ "오늘 날씨가 좋네요."
424
+ )
425
+
426
+ local PASS=0 FAIL=0
427
+ for i in "${!PROMPTS[@]}"; do
428
+ local PROMPT="${PROMPTS[$i]}"
429
+ local NUM=$((i + 1))
430
+ if RESP=$(timeout 45 ollama run "$MODEL_NAME" "$PROMPT" 2>&1) && [[ -n "$RESP" ]]; then
431
+ log_ok " 프롬프트 $NUM 응답 OK (${#RESP}자)"
432
+ PASS=$((PASS + 1))
433
+ else
434
+ log_fail " 프롬프트 $NUM 응답 실패"
435
+ FAIL=$((FAIL + 1))
436
+ fi
437
+ done
438
+
439
+ log_info "Ollama 응답: $PASS/5 성공"
440
+ if [[ $FAIL -eq 0 ]]; then
441
+ record_pass "Ollama 5개 프롬프트 모두 응답 성공"
442
+ else
443
+ record_fail "Ollama 응답 실패 $FAIL/5"
444
+ fi
445
+ }
446
+
447
+ # =============================================================================
448
+ # 최종 요약 출력
449
+ # =============================================================================
450
+ print_summary() {
451
+ local phase="$1"
452
+ local TOTAL=$((GATE_PASS + GATE_FAIL + GATE_SKIP))
453
+ echo ""
454
+ echo "=================================================================="
455
+ echo " Quality Gate 결과: $phase"
456
+ echo " PASS: $GATE_PASS | FAIL: $GATE_FAIL | SKIP: $GATE_SKIP | TOTAL: $TOTAL"
457
+ echo "=================================================================="
458
+
459
+ if [[ $GATE_FAIL -eq 0 ]]; then
460
+ echo -e "${_GREEN} [GATE PASSED]${_NC} 모든 검증 기준 통과"
461
+ echo ""
462
+ return 0
463
+ else
464
+ echo -e "${_RED} [GATE FAILED]${_NC} ${GATE_FAIL}개 검증 기준 미달"
465
+ echo " 실패 항목을 수정한 후 다시 실행하세요."
466
+ echo ""
467
+ return 1
468
+ fi
469
+ }
470
+
471
+ # =============================================================================
472
+ # 진입점
473
+ # =============================================================================
474
+ PHASE="${1:-}"
475
+
476
+ if [[ -z "$PHASE" ]]; then
477
+ echo "Usage: bash scripts/quality_gate.sh <phase>"
478
+ echo " phase: pretrain | sft | orpo | deploy | all"
479
+ exit 2
480
+ fi
481
+
482
+ echo ""
483
+ echo "=================================================================="
484
+ echo " Quality Gate 검증 시작: $PHASE"
485
+ echo " 프로젝트: $PROJECT_DIR"
486
+ echo " 시각 : $(date '+%Y-%m-%d %H:%M:%S')"
487
+ echo "=================================================================="
488
+
489
+ case "$PHASE" in
490
+ pretrain)
491
+ gate_pretrain
492
+ print_summary "pretrain"
493
+ ;;
494
+ sft)
495
+ gate_sft
496
+ print_summary "sft"
497
+ ;;
498
+ orpo)
499
+ gate_orpo
500
+ print_summary "orpo"
501
+ ;;
502
+ deploy)
503
+ gate_deploy
504
+ print_summary "deploy"
505
+ ;;
506
+ all)
507
+ gate_pretrain
508
+ gate_sft
509
+ gate_orpo
510
+ gate_deploy
511
+ print_summary "all"
512
+ ;;
513
+ *)
514
+ echo "ERROR: 알 수 없는 phase: $PHASE"
515
+ echo "Usage: bash scripts/quality_gate.sh <pretrain|sft|orpo|deploy|all>"
516
+ exit 2
517
+ ;;
518
+ esac
source/scripts/run_eval.sh ADDED
@@ -0,0 +1,23 @@
1
+ #!/usr/bin/env bash
2
+ # Usage: bash scripts/run_eval.sh <checkpoint_dir>
3
+ # Example: bash scripts/run_eval.sh checkpoints/korean_1b_fp8_run1/checkpoint-0200000
4
+ set -euo pipefail
5
+
6
+ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
7
+ PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
8
+
9
+ CHECKPOINT="${1:?Usage: bash scripts/run_eval.sh <checkpoint_dir>}"
10
+
11
+ echo "=== Perplexity Evaluation ==="
12
+ python "$PROJECT_DIR/eval/perplexity.py" \
13
+ --checkpoint "$CHECKPOINT" \
14
+ --data "$PROJECT_DIR/data/korean_val.bin" \
15
+ --device cuda:0
16
+
17
+ echo ""
18
+ echo "=== Text Generation ==="
19
+ python "$PROJECT_DIR/eval/generate.py" \
20
+ --checkpoint "$CHECKPOINT" \
21
+ --prompt "안녕하세요, 저는" \
22
+ --max_new_tokens 200 \
23
+ --device cuda:0
source/scripts/run_eval_full.sh ADDED
@@ -0,0 +1,236 @@
1
+ #!/usr/bin/env bash
2
+ # ============================================================
3
+ # run_eval_full.sh — 전체 한국어 벤치마크 평가 (목표: 1.5-3시간)
4
+ #
5
+ # 사용법:
6
+ # bash scripts/run_eval_full.sh [CHECKPOINT_DIR] [OUTPUT_DIR]
7
+ #
8
+ # 예시:
9
+ # bash scripts/run_eval_full.sh \
10
+ # checkpoints/korean_1b_sft/checkpoint-0005000 \
11
+ # eval/outputs/full_5000
12
+ #
13
+ # 태스크:
14
+ # - KoBEST (5): boolq, copa, hellaswag, sentineg, wic
15
+ # - HAE-RAE Bench (5): general_knowledge, history, loan_word, rare_word, standard_nomenclature
16
+ # - Global MMLU Korean: 57개 도메인
17
+ # - PAWS-Ko: 패러프레이즈 탐지
18
+ # - KorMedMCQA: 한국어 의학 MCQ (선택)
19
+ #
20
+ # 총 예상 샘플: ~15,000개
21
+ # 1B 모델 @ 8×B200 기준: 약 1.5-3시간
22
+ # ============================================================
23
+ set -euo pipefail
24
+
25
+ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
26
+ PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
27
+
28
+ # ─── 인자 처리 ────────────────────────────────────────────
29
+ CHECKPOINT="${1:-checkpoints/korean_1b_sft/checkpoint-0005000}"
30
+ TIMESTAMP="$(date +%Y%m%d_%H%M%S)"
31
+ OUTPUT_DIR="${2:-eval/outputs/full_${TIMESTAMP}}"
32
+
33
+ [[ "$CHECKPOINT" != /* ]] && CHECKPOINT="$PROJECT_DIR/$CHECKPOINT"
34
+ [[ "$OUTPUT_DIR" != /* ]] && OUTPUT_DIR="$PROJECT_DIR/$OUTPUT_DIR"
35
+
36
+ # ─── 설정 ────────────────────────────────────────────────
37
+ HF_MODEL_DIR="$PROJECT_DIR/outputs/hf_$(basename "$CHECKPOINT")"
38
+ TOKENIZER="$PROJECT_DIR/tokenizer/korean_sp/tokenizer.json"
39
+
40
+ # GPU 설정: 단일 GPU 또는 tensor parallel
41
+ # lm-eval의 hf backend는 기본 단일 GPU 사용
42
+ # 멀티 GPU: --model_args "pretrained=...,parallelize=True" (자동 device_map)
43
+ USE_MULTI_GPU="${USE_MULTI_GPU:-0}"
44
+ if [ "$USE_MULTI_GPU" = "1" ]; then
45
+ MODEL_EXTRA_ARGS=",parallelize=True"
46
+ echo "▶ 멀티 GPU 모드 활성화 (device_map=auto)"
47
+ else
48
+ MODEL_EXTRA_ARGS=""
49
+ CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0}"
50
+ fi
51
+
52
+ BATCH_SIZE="${BATCH_SIZE:-auto}"
53
+ NUM_FEWSHOT="${NUM_FEWSHOT:-0}"
54
+
55
+ # ─── 태스크 정의 ─────────────────────────────────────────
56
+ # Core Korean tasks (항상 실행)
57
+ TASKS_CORE="kobest,haerae,paws_ko"
58
+
59
+ # Extended tasks (시간 있을 때)
60
+ TASKS_EXTENDED="global_mmlu_ko"
61
+
62
+ # 선택적 태스크
63
+ TASKS_OPTIONAL="kormedmcqa" # 한국어 의학 MCQ
64
+
65
+ # 전체 실행 태스크
66
+ TASKS="${TASKS_CORE},${TASKS_EXTENDED}"
67
+
68
+ # ─── 의존성 확인 ─────────────────────────────────────────
69
+ check_dep() {
70
+ python3 -c "import $1" 2>/dev/null || { echo "❌ $1 not found. pip install $2"; exit 1; }
71
+ }
72
+ check_dep lm_eval lm-eval
73
+ check_dep transformers transformers
74
+ check_dep safetensors safetensors
75
+
76
+ echo "=================================================="
77
+ echo " Ko-LLM Full Benchmark Evaluation"
78
+ echo "=================================================="
79
+ echo " Checkpoint : $CHECKPOINT"
80
+ echo " HF output : $HF_MODEL_DIR"
81
+ echo " Tasks : $TASKS"
82
+ echo " Few-shot : $NUM_FEWSHOT"
83
+ echo " Batch size : $BATCH_SIZE"
84
+ echo " Output : $OUTPUT_DIR"
85
+ echo " Multi-GPU : $USE_MULTI_GPU"
86
+ echo " Start time : $(date)"
87
+ echo "=================================================="
88
+
89
+ mkdir -p "$OUTPUT_DIR"
90
+ LOG_FILE="$OUTPUT_DIR/eval_full.log"
91
+
92
+ # ─── Step 1: HF 포맷 변환 ───────────────────────────────
93
+ echo ""
94
+ echo "▶ [1/3] 커스텀 체크포인트 → HF 포맷 변환..."
95
+
96
+ if [ ! -f "$HF_MODEL_DIR/config.json" ]; then
97
+ python3 "$PROJECT_DIR/scripts/convert_to_hf.py" \
98
+ --checkpoint "$CHECKPOINT" \
99
+ --output "$HF_MODEL_DIR" \
100
+ --tokenizer "$TOKENIZER" \
101
+ 2>&1 | tee -a "$LOG_FILE"
102
+ echo "✅ HF 변환 완료: $HF_MODEL_DIR"
103
+ else
104
+ echo " ↳ HF 모델 이미 존재, 변환 스킵: $HF_MODEL_DIR"
105
+ fi
106
+
107
+ # ─── Step 2: 전체 평가 ──────────────────────────────────
108
+ echo ""
109
+ echo "▶ [2/3] lm-eval 전체 평가 시작..."
110
+ echo " ↳ 로그: $LOG_FILE"
111
+ START_TIME=$(date +%s)
112
+
113
+ if [ "$USE_MULTI_GPU" = "1" ]; then
114
+ python3 -m lm_eval \
115
+ --model hf \
116
+ --model_args "pretrained=$HF_MODEL_DIR,dtype=float16,parallelize=True" \
117
+ --tasks "$TASKS" \
118
+ --num_fewshot "$NUM_FEWSHOT" \
119
+ --batch_size "$BATCH_SIZE" \
120
+ --output_path "$OUTPUT_DIR" \
121
+ --log_samples \
122
+ --verbosity INFO \
123
+ 2>&1 | tee -a "$LOG_FILE"
124
+ else
125
+ CUDA_VISIBLE_DEVICES="$CUDA_VISIBLE_DEVICES" python3 -m lm_eval \
126
+ --model hf \
127
+ --model_args "pretrained=$HF_MODEL_DIR,dtype=float16" \
128
+ --tasks "$TASKS" \
129
+ --num_fewshot "$NUM_FEWSHOT" \
130
+ --batch_size "$BATCH_SIZE" \
131
+ --output_path "$OUTPUT_DIR" \
132
+ --log_samples \
133
+ --verbosity INFO \
134
+ 2>&1 | tee -a "$LOG_FILE"
135
+ fi
136
+
137
+ END_TIME=$(date +%s)
138
+ ELAPSED=$(( END_TIME - START_TIME ))
139
+ echo ""
140
+ echo "✅ 평가 완료! 소요: $((ELAPSED/60))분 $((ELAPSED%60))초"
141
+
142
+ # ─── Step 3: 결과 요약 리포트 생성 ─────────────────────
143
+ echo ""
144
+ echo "▶ [3/3] 결과 리포트 생성..."
145
+
146
+ python3 - "$OUTPUT_DIR" "$CHECKPOINT" <<'PYEOF'
147
+ import json, glob, sys, os
148
+ from datetime import datetime
149
+
150
+ output_dir = sys.argv[1]
151
+ checkpoint = sys.argv[2] if len(sys.argv) > 2 else "unknown"
152
+
153
+ results_files = sorted(glob.glob(f"{output_dir}/**/*.json", recursive=True))
154
+ results_files = [f for f in results_files if "samples_" not in os.path.basename(f)]
155
+
156
+ report_lines = [
157
+ f"# Ko-LLM Full Eval Report",
158
+ f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
159
+ f"Checkpoint: {checkpoint}",
160
+ "",
161
+ ]
162
+
163
+ all_results = {}
164
+ for rf in results_files:
165
+ try:
166
+ with open(rf) as f:
167
+ data = json.load(f)
168
+ results = data.get("results", {})
169
+ if results:
170
+ all_results.update(results)
171
+ except Exception:
172
+ pass
173
+
174
+ # KoBEST 요약
175
+ kobest_tasks = [k for k in all_results if k.startswith("kobest_")]
176
+ if kobest_tasks:
177
+ report_lines.append("## KoBEST")
178
+ report_lines.append("| Task | Metric | Score |")
179
+ report_lines.append("|------|--------|-------|")
180
+ for task in sorted(kobest_tasks):
181
+ metrics = all_results[task]
182
+ for key, val in metrics.items():
183
+ if "stderr" not in key and isinstance(val, (int, float)):
184
+ report_lines.append(f"| {task} | {key} | {val:.4f} |")
185
+
186
+ # HAE-RAE 요약
187
+ haerae_tasks = [k for k in all_results if k.startswith("haerae")]
188
+ if haerae_tasks:
189
+ report_lines.append("\n## HAE-RAE Bench")
190
+ report_lines.append("| Task | Metric | Score |")
191
+ report_lines.append("|------|--------|-------|")
192
+ for task in sorted(haerae_tasks):
193
+ metrics = all_results[task]
194
+ for key, val in metrics.items():
195
+ if "stderr" not in key and isinstance(val, (int, float)):
196
+ report_lines.append(f"| {task} | {key} | {val:.4f} |")
197
+
198
+ # MMLU Ko 요약 (상위 레벨만)
199
+ mmlu_top = {k: v for k, v in all_results.items()
200
+ if k.startswith("global_mmlu_ko") and "_" not in k.replace("global_mmlu_ko", "")}
201
+ if mmlu_top:
202
+ report_lines.append("\n## Global MMLU (Korean)")
203
+ for task, metrics in mmlu_top.items():
204
+ for key, val in metrics.items():
205
+ if "stderr" not in key and isinstance(val, (int, float)):
206
+ report_lines.append(f"- {task} {key}: {val:.4f}")
207
+
208
+ # 기타
209
+ other_tasks = [k for k in all_results
210
+ if not k.startswith("kobest_")
211
+ and not k.startswith("haerae")
212
+ and not k.startswith("global_mmlu_ko")]
213
+ if other_tasks:
214
+ report_lines.append("\n## 기타 태스크")
215
+ for task in sorted(other_tasks):
216
+ metrics = all_results[task]
217
+ for key, val in metrics.items():
218
+ if "stderr" not in key and isinstance(val, (int, float)):
219
+ report_lines.append(f"- {task} | {key}: {val:.4f}")
220
+
221
+ report_path = os.path.join(output_dir, "SUMMARY.md")
222
+ with open(report_path, "w") as f:
223
+ f.write("\n".join(report_lines))
224
+
225
+ print("\n".join(report_lines))
226
+ print(f"\n📄 리포트 저장: {report_path}")
227
+ PYEOF
228
+
229
+ echo ""
230
+ echo "=================================================="
231
+ echo "✅ 전체 평가 완료!"
232
+ echo " 결과 디렉토리: $OUTPUT_DIR"
233
+ echo " 요약 리포트 : $OUTPUT_DIR/SUMMARY.md"
234
+ echo " 전체 로그 : $LOG_FILE"
235
+ echo " 완료 시각 : $(date)"
236
+ echo "=================================================="
source/scripts/run_eval_quick.sh ADDED
@@ -0,0 +1,150 @@
1
+ #!/usr/bin/env bash
2
+ # ============================================================
3
+ # run_eval_quick.sh — 빠른 평가 체크 (목표: 20-30분)
4
+ #
5
+ # 사용법:
6
+ # bash scripts/run_eval_quick.sh [CHECKPOINT_DIR] [OUTPUT_DIR]
7
+ #
8
+ # 예시:
9
+ # bash scripts/run_eval_quick.sh \
10
+ # checkpoints/korean_1b_sft/checkpoint-0005000 \
11
+ # eval/outputs/quick_5000
12
+ #
13
+ # 태스크: kobest_boolq, kobest_copa, haerae_general_knowledge,
14
+ # haerae_history, paws_ko
15
+ # ============================================================
16
+ set -euo pipefail
17
+
18
+ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
19
+ PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
20
+
21
+ # ─── 인자 처리 ────────────────────────────────────────────
22
+ CHECKPOINT="${1:-checkpoints/korean_1b_sft/checkpoint-0005000}"
23
+ TIMESTAMP="$(date +%Y%m%d_%H%M%S)"
24
+ OUTPUT_DIR="${2:-eval/outputs/quick_${TIMESTAMP}}"
25
+
26
+ # 상대 경로 → 절대 경로
27
+ [[ "$CHECKPOINT" != /* ]] && CHECKPOINT="$PROJECT_DIR/$CHECKPOINT"
28
+ [[ "$OUTPUT_DIR" != /* ]] && OUTPUT_DIR="$PROJECT_DIR/$OUTPUT_DIR"
29
+
30
+ # ─── 설정 ────────────────────────────────────────────────
31
+ HF_MODEL_DIR="$PROJECT_DIR/outputs/hf_$(basename "$CHECKPOINT")"
32
+ TOKENIZER="$PROJECT_DIR/tokenizer/korean_sp/tokenizer.json"
33
+ DEVICE="${CUDA_VISIBLE_DEVICES:-0}" # 기본: GPU 0번만 사용
34
+ BATCH_SIZE="auto"
35
+
36
+ # 빠른 체크 태스크 (약 2,000 샘플, ~20분)
37
+ TASKS="kobest_boolq,kobest_copa,haerae_general_knowledge,haerae_history,paws_ko"
38
+
39
+ # ─── 의존성 확인 ─────────────────────────────────────────
40
+ check_dep() {
41
+ python3 -c "import $1" 2>/dev/null || { echo "❌ $1 not found. pip install $2"; exit 1; }
42
+ }
43
+ check_dep lm_eval lm-eval
44
+ check_dep transformers transformers
45
+ check_dep safetensors safetensors
46
+
47
+ echo "=================================================="
48
+ echo " Ko-LLM Quick Eval"
49
+ echo "=================================================="
50
+ echo " Checkpoint : $CHECKPOINT"
51
+ echo " HF output : $HF_MODEL_DIR"
52
+ echo " Tasks : $TASKS"
53
+ echo " Output : $OUTPUT_DIR"
54
+ echo " Device : cuda:$DEVICE"
55
+ echo "=================================================="
56
+
57
+ mkdir -p "$OUTPUT_DIR"
58
+
59
+ # ─── Step 1: HF 포맷 변환 ───────────────────────────────
60
+ if [ ! -f "$HF_MODEL_DIR/config.json" ]; then
61
+ echo ""
62
+ echo "▶ Step 1: 커스텀 체크포인트 → HF 포맷 변환..."
63
+ python3 "$PROJECT_DIR/scripts/convert_to_hf.py" \
64
+ --checkpoint "$CHECKPOINT" \
65
+ --output "$HF_MODEL_DIR" \
66
+ --tokenizer "$TOKENIZER"
67
+ echo "✅ HF 변환 완료: $HF_MODEL_DIR"
68
+ else
69
+ echo "▶ Step 1: HF 모델 이미 존재, 변환 스킵"
70
+ echo " $HF_MODEL_DIR"
71
+ fi
72
+
73
+ # ─── Step 2: lm-eval 실행 ───────────────────────────────
74
+ echo ""
75
+ echo "▶ Step 2: lm-eval 평가 시작..."
76
+ START_TIME=$(date +%s)
77
+
78
+ CUDA_VISIBLE_DEVICES="$DEVICE" python3 -m lm_eval \
79
+ --model hf \
80
+ --model_args "pretrained=$HF_MODEL_DIR,dtype=float16" \
81
+ --tasks "$TASKS" \
82
+ --num_fewshot 0 \
83
+ --batch_size "$BATCH_SIZE" \
84
+ --output_path "$OUTPUT_DIR" \
85
+ --log_samples \
86
+ --verbosity INFO \
87
+ 2>&1 | tee "$OUTPUT_DIR/eval.log"
88
+
89
+ END_TIME=$(date +%s)
90
+ ELAPSED=$(( END_TIME - START_TIME ))
91
+
92
+ echo ""
93
+ echo "=================================================="
94
+ echo "✅ 평가 완료!"
95
+ echo " 소요시간: $((ELAPSED / 60))분 $((ELAPSED % 60))초"
96
+ echo " 결과 저장: $OUTPUT_DIR"
97
+ echo "=================================================="
98
+
99
+ # ─── Step 3: 결과 요약 출력 ─────────────────────────────
100
+ echo ""
101
+ echo "▶ Step 3: 결과 요약"
102
+ python3 - "$OUTPUT_DIR" <<'PYEOF'
130
+ import json, glob, sys, os
131
+ output_dir = sys.argv[1] if len(sys.argv) > 1 else "."
132
+ results_files = glob.glob(f"{output_dir}/**/*.json", recursive=True)
133
+ results_files = [f for f in results_files if "results" in os.path.basename(f)]
134
+ if not results_files:
135
+ # try finding any json
136
+ results_files = glob.glob(f"{output_dir}/*.json")
137
+ for rf in results_files[:3]:
138
+ try:
139
+ with open(rf) as f:
140
+ data = json.load(f)
141
+ results = data.get("results", {})
142
+ print(f"\n{'='*50}\nTask Results: {os.path.basename(rf)}\n{'='*50}")
143
+ for task, metrics in results.items():
144
+ print(f"\n{task}:")
145
+ for key, val in metrics.items():
146
+ if "stderr" not in key and isinstance(val, (int, float)):
147
+ print(f" {key}: {val:.4f}")
148
+ except Exception as e:
149
+ print(f"파싱 실패: {rf}: {e}")
150
+ PYEOF
source/scripts/run_pretrain.sh ADDED
@@ -0,0 +1,26 @@
1
+ #!/bin/bash
2
+ # Usage: bash scripts/run_pretrain.sh [additional pretrain.py args]
3
+ # Runs 8-GPU DDP pretraining via torchrun.
4
+ #
5
+ # Any extra arguments are forwarded verbatim to pretrain.py.
6
+ # Examples:
7
+ # bash scripts/run_pretrain.sh --max_steps 200000
8
+ # bash scripts/run_pretrain.sh --resume checkpoints/checkpoint-0010000
9
+
10
+ set -euo pipefail
11
+
12
+ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
13
+ PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
14
+
15
+ torchrun \
16
+ --nproc_per_node=8 \
17
+ --master_port=29500 \
18
+ "$PROJECT_DIR/train/pretrain.py" \
19
+ --config "$PROJECT_DIR/configs/small.yaml" \
20
+ --train_data "$PROJECT_DIR/data/train.bin" \
21
+ --val_data "$PROJECT_DIR/data/val.bin" \
22
+ --checkpoint_dir "$PROJECT_DIR/checkpoints" \
23
+ --batch_size 8 \
24
+ --grad_accum 4 \
25
+ --warmup_steps 2000 \
26
+ "$@"
source/scripts/start-gateway.sh ADDED
@@ -0,0 +1,44 @@
1
+ #!/usr/bin/env bash
2
+ # start-gateway.sh — OpenClaw 게이트웨이 직접 시작 (독립 프로세스)
3
+ set -euo pipefail
4
+
5
+ RNTIER_HOME="${RNTIER_HOME:-$HOME}"
6
+ export PATH="${RNTIER_HOME}/.npm-global/bin:/usr/bin:/usr/local/bin:/bin:$PATH"
7
+ export HOME="${HOME:-/home/ghong}"
8
+ export OPENCLAW_STATE_DIR="${RNTIER_HOME}/.openclaw"
9
+ export OPENCLAW_CONFIG_PATH="${RNTIER_HOME}/.openclaw/openclaw.json"
10
+
11
+ LOG_DIR="/tmp/openclaw"
12
+ GATEWAY_LOG="${LOG_DIR}/gateway.log"
13
+ PID_FILE="/tmp/openclaw-gateway.pid"
14
+
15
+ mkdir -p "$LOG_DIR"
16
+
17
+ # 기존 프로세스 정리
18
+ pkill -f "openclaw.*gateway" 2>/dev/null || true
19
+ sleep 2
20
+
21
+ # 게이트웨이 시작 — setsid로 완전 분리
22
+ setsid nohup "${RNTIER_HOME}/.npm-global/bin/openclaw" gateway run \
23
+ --port 18789 \
24
+ --bind loopback \
25
+ >> "$GATEWAY_LOG" 2>&1 < /dev/null &
26
+
27
+ PID=$!
28
+ echo "$PID" > "$PID_FILE"
29
+ date +%s > /tmp/openclaw-last-restart
30
+
31
+ echo "[$(date)] Gateway launched with PID $PID"
32
+
33
+ # 10초 대기 후 상태 확인
34
+ sleep 10
35
+
36
+ if kill -0 "$PID" 2>/dev/null; then
37
+ echo "[$(date)] OK: Gateway PID $PID is alive"
38
+ ss -tlnH "sport = :18789" 2>/dev/null | grep -q . && echo "[$(date)] OK: Port 18789 is listening" || echo "[$(date)] WARN: Port 18789 not yet listening"
39
+ else
40
+ echo "[$(date)] FAIL: Gateway PID $PID died"
41
+ echo "--- Last 20 lines of gateway.log ---"
42
+ tail -20 "$GATEWAY_LOG" 2>/dev/null
43
+ exit 1
44
+ fi
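A minimal companion snippet for checking on or stopping the gateway by hand, reusing the PID file, log path, and port from the script above (illustrative only — not a separate script in this repo):

```bash
PID_FILE=/tmp/openclaw-gateway.pid

# Status
if kill -0 "$(cat "$PID_FILE" 2>/dev/null)" 2>/dev/null; then
  echo "gateway up (PID $(cat "$PID_FILE"))"
else
  echo "gateway down"
fi
ss -tlnH "sport = :18789" | grep -q . && echo "port 18789 listening" || echo "port 18789 not listening"

# Stop and show the tail of the log
kill "$(cat "$PID_FILE" 2>/dev/null)" 2>/dev/null || true
tail -5 /tmp/openclaw/gateway.log
```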
source/scripts/telegram_notify.py ADDED
@@ -0,0 +1,168 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Standalone Telegram notification helper for FRANKENSTALLM 3B training.
4
+
5
+ Usage:
6
+ python3 scripts/telegram_notify.py "Your message here"
7
+ python3 scripts/telegram_notify.py "<b>Bold</b> message" --parse-mode HTML
8
+
9
+ Function API:
10
+ from scripts.telegram_notify import send_telegram
11
+ send_telegram("message text")
12
+ """
13
+
14
+ import os
15
+ import sys
16
+ import json
17
+ import urllib.request
18
+ import urllib.parse
19
+ import urllib.error
20
+ import logging
21
+ from typing import Optional
22
+
23
+ # ─── Configuration ────────────────────────────────────────────────────────────
24
+ BOT_TOKEN = os.environ.get("TELEGRAM_BOT_TOKEN", "")
25
+ CHAT_ID = os.environ.get("TELEGRAM_CHAT_ID", "")
26
+ TIMEOUT = 15 # seconds
27
+ MAX_MSG_LEN = 4096 # Telegram limit
28
+
29
+ logging.basicConfig(
30
+ level=logging.WARNING,
31
+ format="%(asctime)s [telegram_notify] %(levelname)s: %(message)s",
32
+ )
33
+ log = logging.getLogger("telegram_notify")
34
+
35
+
36
+ def send_telegram(
37
+ message: str,
38
+ parse_mode: str = "HTML",
39
+ token: str = BOT_TOKEN,
40
+ chat_id: str = CHAT_ID,
41
+ disable_web_page_preview: bool = True,
42
+ ) -> bool:
43
+ """
44
+ Send a Telegram message via Bot API using urllib (curl-free).
45
+
46
+ Args:
47
+ message: Text to send (HTML or Markdown depending on parse_mode).
48
+ parse_mode: "HTML", "Markdown", "MarkdownV2", or "" (plain).
49
+ token: Bot token (defaults to module-level BOT_TOKEN).
50
+ chat_id: Recipient chat/channel ID.
51
+ disable_web_page_preview: Suppress link previews.
52
+
53
+ Returns:
54
+ True on success, False on any error.
55
+ """
56
+ if not message:
57
+ log.warning("Empty message — skipping send.")
58
+ return False
59
+
60
+ # Truncate if over Telegram limit, with notice
61
+ if len(message) > MAX_MSG_LEN:
62
+ truncated_notice = "\n\n<i>[message truncated]</i>" if parse_mode == "HTML" else "\n\n[message truncated]"
63
+ message = message[: MAX_MSG_LEN - len(truncated_notice)] + truncated_notice
64
+
65
+ url = f"https://api.telegram.org/bot{token}/sendMessage"
66
+
67
+ payload: dict = {
68
+ "chat_id": chat_id,
69
+ "text": message,
70
+ "disable_web_page_preview": disable_web_page_preview,
71
+ }
72
+ if parse_mode:
73
+ payload["parse_mode"] = parse_mode
74
+
75
+ data = urllib.parse.urlencode(payload).encode("utf-8")
76
+
77
+ try:
78
+ req = urllib.request.Request(
79
+ url,
80
+ data=data,
81
+ method="POST",
82
+ headers={"Content-Type": "application/x-www-form-urlencoded"},
83
+ )
84
+ with urllib.request.urlopen(req, timeout=TIMEOUT) as resp:
85
+ body = resp.read().decode("utf-8")
86
+ result = json.loads(body)
87
+ if result.get("ok"):
88
+ return True
89
+ else:
90
+ log.error("Telegram API error: %s", result.get("description", result))
91
+ return False
92
+
93
+ except urllib.error.HTTPError as e:
94
+ try:
95
+ err_body = e.read().decode("utf-8")
96
+ except Exception:
97
+ err_body = str(e)
98
+ log.error("HTTP %d from Telegram: %s", e.code, err_body)
99
+ return False
100
+
101
+ except urllib.error.URLError as e:
102
+ log.error("Network error sending Telegram message: %s", e.reason)
103
+ return False
104
+
105
+ except json.JSONDecodeError as e:
106
+ log.error("Failed to parse Telegram response: %s", e)
107
+ return False
108
+
109
+ except Exception as e: # noqa: BLE001
110
+ log.error("Unexpected error in send_telegram: %s", e)
111
+ return False
112
+
113
+
114
+ def send_telegram_safe(message: str, **kwargs) -> bool:
115
+ """
116
+ Wrapper that catches ALL exceptions — guaranteed never to crash the caller.
117
+ Suitable for embedding in training loops where stability is critical.
118
+ """
119
+ try:
120
+ return send_telegram(message, **kwargs)
121
+ except Exception as e: # noqa: BLE001
122
+ log.error("send_telegram_safe caught unhandled exception: %s", e)
123
+ return False
124
+
125
+
126
+ # ─── CLI entry point ──────────────────────────────────────────────────────────
127
+ if __name__ == "__main__":
128
+ import argparse
129
+
130
+ parser = argparse.ArgumentParser(
131
+ description="Send a Telegram message from the command line."
132
+ )
133
+ parser.add_argument("message", nargs="?", help="Message text to send")
134
+ parser.add_argument(
135
+ "--parse-mode",
136
+ default="HTML",
137
+ choices=["HTML", "Markdown", "MarkdownV2", ""],
138
+ help="Telegram parse_mode (default: HTML)",
139
+ )
140
+ parser.add_argument(
141
+ "--token", default=BOT_TOKEN, help="Override bot token"
142
+ )
143
+ parser.add_argument(
144
+ "--chat-id", default=CHAT_ID, help="Override chat ID"
145
+ )
146
+ args = parser.parse_args()
147
+
148
+ # Allow piped stdin if no positional arg given
149
+ if args.message is None:
150
+ if not sys.stdin.isatty():
151
+ args.message = sys.stdin.read().strip()
152
+ else:
153
+ parser.print_help()
154
+ sys.exit(1)
155
+
156
+ ok = send_telegram(
157
+ args.message,
158
+ parse_mode=args.parse_mode,
159
+ token=args.token,
160
+ chat_id=args.chat_id,
161
+ )
162
+
163
+ if ok:
164
+ print("Telegram message sent successfully.")
165
+ sys.exit(0)
166
+ else:
167
+ print("ERROR: Failed to send Telegram message.", file=sys.stderr)
168
+ sys.exit(1)
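A minimal sketch of embedding send_telegram_safe in a training loop — the loop, interval, and loss value are placeholders, the project root must be on PYTHONPATH, and TELEGRAM_BOT_TOKEN / TELEGRAM_CHAT_ID must be set for anything to actually be delivered:

```python
from scripts.telegram_notify import send_telegram_safe

NOTIFY_EVERY = 1000  # steps between progress messages

for step in range(10_000):   # placeholder training loop
    loss = 2.0               # placeholder — use the real training loss here
    if step % NOTIFY_EVERY == 0:
        # Never raises, so a Telegram outage cannot kill the run.
        send_telegram_safe(f"<b>step {step}</b> | loss {loss:.4f}")
```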
source/scripts/test_ollama_repetition.py ADDED
@@ -0,0 +1,148 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ test_ollama_repetition.py — Ollama 배포 모델 반복률 검증
4
+
5
+ ORPO eval과 동일한 프롬프트로 Ollama API 호출 후 n-gram 반복률 + EOS 종료율 측정.
6
+ 목표: 3-gram rep < 3% (한국어 자연 반복 고려), EOS 종료율 > 95%
7
+
8
+ Usage:
9
+ python scripts/test_ollama_repetition.py [--model frankenstallm-3b] [--host localhost:11434]
10
+ """
11
+ import argparse
12
+ import json
13
+ import urllib.request
14
+ import urllib.error
15
+ import sys
16
+ from collections import Counter
17
+
18
+ # ORPO eval에서 사용한 15개 한국어 프롬프트
19
+ TEST_PROMPTS = [
20
+ "대한민국의 수도는 어디인가요?",
21
+ "인공지능이란 무엇인가요?",
22
+ "한국의 전통 음식 중에서 김치에 대해 설명해주세요.",
23
+ "프로그래밍을 배우려면 어떻게 해야 하나요?",
24
+ "지구 온난화의 원인과 대책에 대해 설명해주세요.",
25
+ "한국어의 특징을 3가지 설명해주세요.",
26
+ "좋은 리더의 자질에 대해 논해주세요.",
27
+ "우주 탐사의 의미와 중요성을 설명해주세요.",
28
+ "건강한 생활 습관 5가지를 추천해주세요.",
29
+ "인터넷이 현대 사회에 미친 영향을 분석해주세요.",
30
+ "한국의 교육 제도의 장단점을 설명해주세요.",
31
+ "환경 보호를 위해 개인이 할 수 있는 일을 알려주세요.",
32
+ "4차 산업혁명이 일자리에 미치는 영향을 분석해주세요.",
33
+ "독서의 중요성과 효과적인 독서 방법을 알려주세요.",
34
+ "한국 문화의 세계화에 대해 어떻게 생각하시나요?",
35
+ ]
36
+
37
+
38
+ def compute_ngram_repetition(text: str, n: int) -> float:
39
+ """n-gram 반복률 계산 (0.0 ~ 1.0)"""
40
+ tokens = text.split()
41
+ if len(tokens) < n:
42
+ return 0.0
43
+ ngrams = [tuple(tokens[i:i+n]) for i in range(len(tokens) - n + 1)]
44
+ if not ngrams:
45
+ return 0.0
46
+ counts = Counter(ngrams)
47
+ repeated = sum(c - 1 for c in counts.values() if c > 1)
48
+ return repeated / len(ngrams)
49
+
50
+
51
+ def call_ollama(prompt: str, model: str, host: str, timeout: int = 120) -> dict:
52
+ """Ollama API 호출"""
53
+ url = f"http://{host}/api/generate"
54
+ payload = json.dumps({
55
+ "model": model,
56
+ "prompt": prompt,
57
+ "stream": False,
58
+ }).encode("utf-8")
59
+
60
+ req = urllib.request.Request(url, data=payload, headers={"Content-Type": "application/json"})
61
+ try:
62
+ with urllib.request.urlopen(req, timeout=timeout) as resp:
63
+ return json.loads(resp.read().decode("utf-8"))
64
+ except urllib.error.URLError as e:
65
+ return {"error": str(e), "response": ""}
66
+ except Exception as e:
67
+ return {"error": str(e), "response": ""}
68
+
69
+
70
+ def main():
71
+ parser = argparse.ArgumentParser(description="Ollama 반복률 검증")
72
+ parser.add_argument("--model", default="frankenstallm-3b", help="Ollama 모델 이름")
73
+ parser.add_argument("--host", default="localhost:11434", help="Ollama 서버 주소")
74
+ args = parser.parse_args()
75
+
76
+ print(f"{'='*70}")
77
+ print(f" Ollama 반복률 검증: {args.model}")
78
+ print(f" 서버: {args.host}")
79
+ print(f" 프롬프트: {len(TEST_PROMPTS)}개")
80
+ print(f"{'='*70}\n")
81
+
82
+ results = []
83
+ for i, prompt in enumerate(TEST_PROMPTS, 1):
84
+ print(f"[{i:2d}/{len(TEST_PROMPTS)}] {prompt[:40]}...")
85
+ resp = call_ollama(prompt, args.model, args.host)
86
+
87
+ if "error" in resp and resp["error"]:
88
+ print(f" ERROR: {resp['error']}")
89
+ results.append({"prompt": prompt, "error": resp["error"]})
90
+ continue
91
+
92
+ text = resp.get("response", "")
93
+ eos_done = resp.get("done", False)
94
+
95
+ rep1 = compute_ngram_repetition(text, 1)
96
+ rep2 = compute_ngram_repetition(text, 2)
97
+ rep3 = compute_ngram_repetition(text, 3)
98
+ rep4 = compute_ngram_repetition(text, 4)
99
+
100
+ results.append({
101
+ "prompt": prompt,
102
+ "response_len": len(text),
103
+ "word_count": len(text.split()),
104
+ "eos_done": eos_done,
105
+ "rep1": rep1, "rep2": rep2, "rep3": rep3, "rep4": rep4,
106
+ })
107
+
108
+ preview = text[:100].replace("\n", " ")
109
+ print(f" 응답: {preview}...")
110
+ print(f" 길이: {len(text)}자, EOS: {eos_done}, "
111
+ f"rep(1/2/3/4): {rep1:.2%}/{rep2:.2%}/{rep3:.2%}/{rep4:.2%}")
112
+ print()
113
+
114
+ # --- Summary ---
115
+ valid = [r for r in results if "error" not in r or not r.get("error")]
116
+ if not valid:
117
+ print("ERROR: 유효한 응답 없음")
118
+ sys.exit(1)
119
+
120
+ avg_rep3 = sum(r["rep3"] for r in valid) / len(valid)
121
+ eos_rate = sum(1 for r in valid if r["eos_done"]) / len(valid)
122
+ errors = len(results) - len(valid)
123
+
124
+ print(f"{'='*70}")
125
+ print(f" 결과 요약")
126
+ print(f"{'='*70}")
127
+ print(f" 유효 응답: {len(valid)}/{len(results)} (에러: {errors})")
128
+ print(f" 평균 3-gram 반복률: {avg_rep3:.2%} (목표: < 3%)")
129
+ print(f" EOS 종료율: {eos_rate:.0%} (목표: > 95%)")
130
+ print()
131
+
132
+ # Pass/Fail
133
+ # 한국어는 조사/접속사 자연 반복으로 어절 기준 3-gram rep 1.5~2%가 자연 floor
134
+ # 퇴행적 반복(30%+)과 구별하여 3% 기준 적용
135
+ rep_pass = avg_rep3 < 0.03
136
+ eos_pass = eos_rate > 0.95
137
+ overall = rep_pass and eos_pass
138
+
139
+ print(f" 3-gram 반복률: {'PASS ✓' if rep_pass else 'FAIL ✗'} ({avg_rep3:.2%})")
140
+ print(f" EOS 종료율: {'PASS ✓' if eos_pass else 'FAIL ✗'} ({eos_rate:.0%})")
141
+ print(f" 종합: {'PASS ✓' if overall else 'FAIL ✗'}")
142
+ print(f"{'='*70}")
143
+
144
+ sys.exit(0 if overall else 1)
145
+
146
+
147
+ if __name__ == "__main__":
148
+ main()
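A quick illustration of what the metric reports (not part of the test itself): a fully degenerate output scores high on unigram repetition, while a normal short sentence scores zero. Run from the project root:

```python
from scripts.test_ollama_repetition import compute_ngram_repetition

degenerate = "안녕 안녕 안녕 안녕"                    # four identical tokens
print(compute_ngram_repetition(degenerate, 1))  # 0.75 — 3 repeated unigrams out of 4
print(compute_ngram_repetition(degenerate, 3))  # 0.5  — 1 repeated trigram out of 2

normal = "대한민국의 수도는 서울입니다"
print(compute_ngram_repetition(normal, 3))      # 0.0  — no repeated trigrams
```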
source/scripts/training_watchdog.sh ADDED
@@ -0,0 +1,292 @@
1
+ #!/usr/bin/env bash
2
+ # =============================================================================
3
+ # training_watchdog.sh — FRANKENSTALLM 3B Cron-based Training Watchdog
4
+ # Run: every 10 minutes via cron
5
+ # Alerts via Telegram only when problems are detected.
6
+ # =============================================================================
7
+ set -euo pipefail
8
+
9
+ # ─── Paths ───────────────────────────────────────────────────────────────────
10
+ WORKDIR="${WORKDIR:-$(cd "$(dirname "$0")/.." && pwd)}"
11
+ CKPT_DIR="$WORKDIR/checkpoints/korean_3b_fp8_run1"
12
+ LOG_FILE="$CKPT_DIR/train.log"
13
+ PID_FILE="$CKPT_DIR/train.pid"
14
+ WATCHDOG_LOG="$CKPT_DIR/watchdog.log"
15
+ STATE_FILE="$CKPT_DIR/watchdog.state" # persists last-good step/time
16
+ NOTIFY="python3 $WORKDIR/scripts/telegram_notify.py"
17
+
18
+ # ─── Thresholds ──────────────────────────────────────────────────────────────
19
+ LOSS_SPIKE_THRESHOLD="5.0" # alert if loss > this value
20
+ LOSS_NAN_PATTERN="nan|inf|NaN|Inf"
21
+ STALL_SECONDS=900 # 15 min without new log line → stalled
22
+ DISK_WARN_PCT=85 # alert if disk usage >= this %
23
+ GPU_UTIL_WARN_PCT=20 # alert if avg GPU util drops below this %
24
+ MIN_TOKPS=5000 # alert if tok/s drops below this
25
+ TOTAL_STEPS=57000
26
+ WAIT_COUNT_FILE="/tmp/frankenstallm-wait-count" # 대기 횟수 파일
27
+ MAX_WAIT_COUNT=10 # 이 횟수 초과 시 알림 후 cron 해제
28
+
29
+ # ─── Helpers ─────────────────────────────────────────────────────────────────
30
+ ts() { date '+%Y-%m-%d %H:%M:%S'; }
31
+
32
+ log_msg() {
33
+ echo "[$(ts)] $*"
34
+ }
35
+
36
+ send_alert() {
37
+ local level="$1"
38
+ local msg="$2"
39
+ log_msg "ALERT[$level]: $msg"
40
+ $NOTIFY "<b>[FRANKENSTALLM ALERT] $level</b>
41
+
42
+ $msg
43
+
44
+ <i>$(ts) | watchdog check</i>" || true
45
+ }
46
+
47
+ # ─── 1. Process alive check ──────────────────────────────────────────────────
48
+ check_process() {
49
+ if [[ ! -f "$PID_FILE" ]]; then
50
+ # 대기 모드: PID 파일 없으면 학습 미시작 상태로 카운트
51
+ local wait_count=0
52
+ [[ -f "$WAIT_COUNT_FILE" ]] && wait_count=$(cat "$WAIT_COUNT_FILE" 2>/dev/null || echo 0)
53
+ wait_count=$(( wait_count + 1 ))
54
+ echo "$wait_count" > "$WAIT_COUNT_FILE"
55
+ log_msg "Training not started yet (waiting ${wait_count}/${MAX_WAIT_COUNT})."
56
+
57
+ if (( wait_count > MAX_WAIT_COUNT )); then
58
+ send_alert "WAIT_TIMEOUT" "학습이 <b>${wait_count}회</b> 체크 동안 시작되지 않았습니다 (~$((wait_count * 10))분).
59
+
60
+ PID 파일 없음: <code>$PID_FILE</code>
61
+
62
+ Watchdog cron을 자동 해제합니다. 학습 시작 후 직접 재등록하세요:
63
+ <code>crontab -e</code>"
64
+ # cron에서 training_watchdog 제거
65
+ crontab -l 2>/dev/null | grep -v "training_watchdog" | crontab -
66
+ rm -f "$WAIT_COUNT_FILE"
67
+ log_msg "Watchdog cron entry removed after ${wait_count} waits."
68
+ fi
69
+ return 1
70
+ fi
71
+ # 학습 시작됨 → 대기 카운터 초기화
72
+ rm -f "$WAIT_COUNT_FILE"
73
+
74
+ local pid
75
+ pid=$(cat "$PID_FILE" 2>/dev/null | tr -d '[:space:]')
76
+
77
+ if [[ -z "$pid" ]]; then
78
+ send_alert "PROCESS" "PID file is empty: $PID_FILE"
79
+ return 1
80
+ fi
81
+
82
+ if ! kill -0 "$pid" 2>/dev/null; then
83
+ # Check if it completed normally (step == TOTAL_STEPS)
84
+ local last_step
85
+ last_step=$(grep -oP 'step\s+\K[0-9]+' "$LOG_FILE" 2>/dev/null | tail -1)
86
+ if [[ "$last_step" == "$TOTAL_STEPS" ]]; then
87
+ log_msg "Training COMPLETED at step $TOTAL_STEPS — process exit is expected."
88
+ send_alert "COMPLETE" "Training completed normally at step <code>$TOTAL_STEPS/$TOTAL_STEPS</code>."
89
+ else
90
+ send_alert "CRASH" "Training process (PID $pid) is NOT running.
91
+ Last logged step: <code>${last_step:-unknown}</code>/$TOTAL_STEPS
92
+
93
+ Check log: <code>tail -50 $LOG_FILE</code>"
94
+ fi
95
+ return 1
96
+ fi
97
+
98
+ log_msg "Process PID $pid is alive."
99
+ return 0
100
+ }
101
+
102
+ # ─── 2. Stall detection ──────────────────────────────────────────────────────
103
+ check_stall() {
104
+ if [[ ! -f "$LOG_FILE" ]]; then
105
+ send_alert "STALL" "Log file not found: $LOG_FILE"
106
+ return 1
107
+ fi
108
+
109
+ local log_mtime now elapsed
110
+ log_mtime=$(stat -c '%Y' "$LOG_FILE" 2>/dev/null || echo 0)
111
+ now=$(date +%s)
112
+ elapsed=$(( now - log_mtime ))
113
+
114
+ if (( elapsed >= STALL_SECONDS )); then
115
+ local mins=$(( elapsed / 60 ))
116
+ send_alert "STALL" "No log activity for <b>${mins} minutes</b> (threshold: $(( STALL_SECONDS/60 ))min).
117
+ Log last modified: <code>$(date -d "@$log_mtime" '+%Y-%m-%d %H:%M:%S')</code>
118
+ Training may be hung or extremely slow."
119
+ return 1
120
+ fi
121
+
122
+ log_msg "Log freshness OK: last update ${elapsed}s ago."
123
+ return 0
124
+ }
125
+
126
+ # ─── 3. Loss anomaly check ───────────────────────────────────────────────────
127
+ check_loss() {
128
+ if [[ ! -f "$LOG_FILE" ]]; then
129
+ return 0
130
+ fi
131
+
132
+ # Get last step line
133
+ local last_line
134
+ last_line=$(grep -E 'step\s+[0-9]+.*loss' "$LOG_FILE" 2>/dev/null | tail -1)
135
+
136
+ if [[ -z "$last_line" ]]; then
137
+ log_msg "No step lines found in log yet."
138
+ return 0
139
+ fi
140
+
141
+ local loss step
142
+ loss=$(echo "$last_line" | grep -oP 'loss\s+\K[0-9.eE+\-naifNIF]+' || echo "")
143
+ step=$(echo "$last_line" | grep -oP 'step\s+\K[0-9]+' || echo "0")
144
+
145
+ if [[ -z "$loss" ]]; then
146
+ log_msg "Could not parse loss from: $last_line"
147
+ return 0
148
+ fi
149
+
150
+ # NaN/Inf check
151
+ if echo "$loss" | grep -qiE "$LOSS_NAN_PATTERN"; then
152
+ send_alert "LOSS_NAN" "Loss is <b>$loss</b> at step <code>$step</code>.
153
+ Training has diverged — NaN/Inf detected.
154
+
155
+ Last log line:
156
+ <code>${last_line}</code>"
157
+ return 1
158
+ fi
159
+
160
+ # Spike check (only after warmup, step > 500)
161
+ if (( step > 500 )); then
162
+ local loss_int
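+ # NOTE: the float comparison below relies on bc; if bc is unavailable, the "|| echo 0" fallback silently disables spike alerts.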
163
+ loss_int=$(echo "$loss >= $LOSS_SPIKE_THRESHOLD" | bc -l 2>/dev/null || echo 0)
164
+ if [[ "$loss_int" == "1" ]]; then
165
+ send_alert "LOSS_SPIKE" "Loss spike detected: <b>$loss</b> at step <code>$step</code> (threshold: $LOSS_SPIKE_THRESHOLD).
166
+
167
+ Last log line:
168
+ <code>${last_line}</code>"
169
+ return 1
170
+ fi
171
+ fi
172
+
173
+ log_msg "Loss OK: $loss at step $step."
174
+ return 0
175
+ }
176
+
177
+ # ─── 4. Throughput check ─────────────────────────────────────────────────────
178
+ check_throughput() {
179
+ if [[ ! -f "$LOG_FILE" ]]; then
180
+ return 0
181
+ fi
182
+
183
+ local last_line
184
+ last_line=$(grep -E 'step\s+[0-9]+.*tok/s' "$LOG_FILE" 2>/dev/null | tail -1)
185
+ [[ -z "$last_line" ]] && return 0
186
+
187
+ # tok/s may be formatted with commas: 36,321
188
+ local tokps step
189
+ tokps=$(echo "$last_line" | grep -oP 'tok/s\s+\K[\d,]+' | tr -d ',' || echo "")
190
+ step=$(echo "$last_line" | grep -oP 'step\s+\K[0-9]+' || echo "0")
191
+
192
+ if [[ -z "$tokps" ]]; then
193
+ log_msg "Could not parse tok/s from last log line."
194
+ return 0
195
+ fi
196
+
197
+ if (( step > 100 && tokps < MIN_TOKPS )); then
198
+ send_alert "THROUGHPUT" "Throughput dropped to <b>${tokps} tok/s</b> at step <code>$step</code> (min: ${MIN_TOKPS}).
199
+ GPU may be throttling, NCCL stalled, or a data worker is slow."
200
+ return 1
201
+ fi
202
+
203
+ log_msg "Throughput OK: ${tokps} tok/s at step $step."
204
+ return 0
205
+ }
206
+
207
+ # ─── 5. GPU utilization check ────────────────────────────────────────────────
208
+ check_gpu() {
209
+ if ! command -v nvidia-smi &>/dev/null; then
210
+ log_msg "nvidia-smi not available — skipping GPU check."
211
+ return 0
212
+ fi
213
+
214
+ local avg_util
215
+ avg_util=$(nvidia-smi --query-gpu=utilization.gpu --format=csv,noheader,nounits 2>/dev/null \
216
+ | awk '{sum+=$1; count++} END {if(count>0) printf "%.0f", sum/count; else print 0}')
217
+
218
+ if [[ -z "$avg_util" || "$avg_util" == "0" ]]; then
219
+ log_msg "GPU util query returned 0 or empty — possibly all idle."
220
+ # Only alert if process is also running
221
+ local pid
222
+ pid=$(cat "$PID_FILE" 2>/dev/null | tr -d '[:space:]')
223
+ if [[ -n "$pid" ]] && kill -0 "$pid" 2>/dev/null; then
224
+ send_alert "GPU_IDLE" "All 8× B200 GPUs show <b>0% utilization</b> while training process is alive.
225
+ Possible NCCL hang or data pipeline stall."
226
+ return 1
227
+ fi
228
+ return 0
229
+ fi
230
+
231
+ if (( avg_util < GPU_UTIL_WARN_PCT )); then
232
+ local gpu_details
233
+ gpu_details=$(nvidia-smi --query-gpu=index,utilization.gpu,memory.used,memory.total \
234
+ --format=csv,noheader 2>/dev/null | head -8 || echo "unavailable")
235
+ send_alert "GPU_LOW" "Average GPU utilization: <b>${avg_util}%</b> (threshold: ${GPU_UTIL_WARN_PCT}%).
236
+
237
+ GPU details:
238
+ <code>${gpu_details}</code>"
239
+ return 1
240
+ fi
241
+
242
+ log_msg "GPU utilization OK: ${avg_util}% average."
243
+ return 0
244
+ }
245
+
246
+ # ─── 6. Disk space check ─────────────────────────────────────────────────────
247
+ check_disk() {
248
+ local usage_pct
249
+ usage_pct=$(df "$CKPT_DIR" 2>/dev/null | awk 'NR==2 {gsub(/%/,"",$5); print $5}')
250
+
251
+ if [[ -z "$usage_pct" ]]; then
252
+ log_msg "Could not determine disk usage for $CKPT_DIR."
253
+ return 0
254
+ fi
255
+
256
+ if (( usage_pct >= DISK_WARN_PCT )); then
257
+ local avail
258
+ avail=$(df -h "$CKPT_DIR" 2>/dev/null | awk 'NR==2 {print $4}')
259
+ send_alert "DISK" "Disk usage at <b>${usage_pct}%</b> (threshold: ${DISK_WARN_PCT}%).
260
+ Available: <b>${avail}</b> on partition containing checkpoints.
261
+
262
+ Risk: checkpoint saves may fail. Consider deleting old checkpoints."
263
+ return 1
264
+ fi
265
+
266
+ log_msg "Disk usage OK: ${usage_pct}% used."
267
+ return 0
268
+ }
269
+
270
+ # ─── Main ────────────────────────────────────────────────────────────────────
271
+ main() {
272
+ log_msg "=== Watchdog check START ==="
273
+
274
+ local issues=0
275
+
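+ # "(( issues++ ))" exits non-zero while issues is still 0 (post-increment evaluates to 0), so the trailing "|| true" keeps set -e from aborting the run.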
276
+ check_process || (( issues++ )) || true
277
+ check_stall || (( issues++ )) || true
278
+ check_loss || (( issues++ )) || true
279
+ check_throughput || (( issues++ )) || true
280
+ check_gpu || (( issues++ )) || true
281
+ check_disk || (( issues++ )) || true
282
+
283
+ if (( issues == 0 )); then
284
+ log_msg "All checks passed — no alerts sent."
285
+ else
286
+ log_msg "Watchdog found $issues issue(s) — alerts sent."
287
+ fi
288
+
289
+ log_msg "=== Watchdog check END ==="
290
+ }
291
+
292
+ main "$@"
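For reference, a cron entry matching the 10-minute cadence mentioned in the header (the checkout path is illustrative; stdout goes to the watchdog.log path the script already defines):

```bash
*/10 * * * * /usr/bin/env bash /path/to/FRANKENSTALLM/scripts/training_watchdog.sh >> /path/to/FRANKENSTALLM/checkpoints/korean_3b_fp8_run1/watchdog.log 2>&1
```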
source/scripts/upload_to_huggingface.py ADDED
@@ -0,0 +1,182 @@
1
+ #!/usr/bin/env python3
2
+ """Upload FRANKENSTALLM: model, eval reports, source code, and data scripts to Hugging Face.
3
+
4
+ Usage:
5
+ huggingface-cli login
6
+
7
+ # 모델 + README + 평가 결과 + 보고서
8
+ python scripts/upload_to_huggingface.py --repo-id pathcosmos/frankenstallm --create-pr
9
+
10
+ # 위 + 소스 코드 + 데이터 스크립트 (모델/데이터/소스 전부)
11
+ python scripts/upload_to_huggingface.py --repo-id pathcosmos/frankenstallm --with-source --with-data --create-pr
12
+
13
+ # 평가·보고서만
14
+ python scripts/upload_to_huggingface.py --repo-id pathcosmos/frankenstallm --readme-only --create-pr
15
+ """
16
+
17
+ import argparse
18
+ from pathlib import Path
19
+
20
+ PROJECT_ROOT = Path(__file__).resolve().parent.parent
21
+ HF_CHECKPOINT = PROJECT_ROOT / "outputs" / "hf_checkpoint-best-fixed"
22
+ REPORTS_DIR = PROJECT_ROOT / "reports"
23
+ EVAL_RESULTS_DIR = PROJECT_ROOT / "eval" / "results" / "frankenstallm-3b-v2"
24
+ DATA_DIR = PROJECT_ROOT / "data"
25
+ SOURCE_DIRS = ["train", "model", "configs", "scripts", "tokenizer", "eval"]
26
+
27
+
28
+ def main():
29
+ parser = argparse.ArgumentParser(description="Upload model, eval reports, source, and data scripts to Hugging Face")
30
+ parser.add_argument("--repo-id", type=str, required=True, help="e.g. pathcosmos/frankenstallm")
31
+ parser.add_argument("--readme-only", action="store_true", help="Only push README + eval results + reports (no model)")
32
+ parser.add_argument("--create-pr", action="store_true", help="Create a Pull Request instead of pushing to main")
33
+ parser.add_argument("--with-source", action="store_true", help="Upload full source code (train, eval, model, configs, scripts, tokenizer)")
34
+ parser.add_argument("--with-data", action="store_true", help="Upload data scripts and DATA_README (no .bin files)")
35
+ args = parser.parse_args()
36
+ create_pr = getattr(args, "create_pr", False)
37
+
38
+ try:
39
+ from huggingface_hub import HfApi, create_repo
40
+ except ImportError:
41
+ print("Install: pip install huggingface_hub")
42
+ raise SystemExit(1)
43
+
44
+ api = HfApi()
45
+
46
+ # 레포 없으면 생성
47
+ # 레포가 없으면 생성 (본인 계정일 때만 성공)
48
+ try:
49
+ create_repo(args.repo_id, repo_type="model", exist_ok=True)
50
+ except Exception as e:
51
+ print(f"Note: create_repo skipped (use Hugging Face website to create repo if needed): {e}")
52
+
53
+ if not args.readme_only:
54
+ if not HF_CHECKPOINT.exists():
55
+ print(f"Checkpoint not found: {HF_CHECKPOINT}")
56
+ raise SystemExit(1)
57
+ print(f"Uploading model from {HF_CHECKPOINT} ...")
58
+ api.upload_folder(
59
+ folder_path=str(HF_CHECKPOINT),
60
+ repo_id=args.repo_id,
61
+ repo_type="model",
62
+ create_pr=create_pr,
63
+ )
64
+ print("Model upload done.")
65
+
66
+ # README는 체크포인트 폴더 것 사용 (이미 평가 요약 포함)
67
+ readme_src = HF_CHECKPOINT / "README.md"
68
+ if readme_src.exists():
69
+ print("Pushing README (model card) ...")
70
+ api.upload_file(
71
+ path_or_fileobj=str(readme_src),
72
+ path_in_repo="README.md",
73
+ repo_id=args.repo_id,
74
+ repo_type="model",
75
+ create_pr=create_pr,
76
+ )
77
+ print("README upload done.")
78
+ else:
79
+ print("No README.md in checkpoint dir; skipping README push.")
80
+
81
+ # 평가 결과 JSON
82
+ results_json = EVAL_RESULTS_DIR / "ollama_benchmark_results.json"
83
+ if results_json.exists():
84
+ print("Pushing ollama_benchmark_results.json ...")
85
+ api.upload_file(
86
+ path_or_fileobj=str(results_json),
87
+ path_in_repo="eval/ollama_benchmark_results.json",
88
+ repo_id=args.repo_id,
89
+ repo_type="model",
90
+ create_pr=create_pr,
91
+ )
92
+ print("Eval results upload done.")
93
+
94
+ # 배포·평가 보고서 (상세 기록)
95
+ for name, src in [
96
+ ("2026-03-09_GGUF_DEPLOYMENT_AND_EVAL_REPORT.md", REPORTS_DIR / "2026-03-09_GGUF_DEPLOYMENT_AND_EVAL_REPORT.md"),
97
+ ("2026-03-09_ORPO_EVALUATION_REPORT.md", REPORTS_DIR / "2026-03-09_ORPO_EVALUATION_REPORT.md"),
98
+ ]:
99
+ if src.exists():
100
+ print(f"Pushing {name} ...")
101
+ api.upload_file(
102
+ path_or_fileobj=str(src),
103
+ path_in_repo=f"eval_reports/{name}",
104
+ repo_id=args.repo_id,
105
+ repo_type="model",
106
+ create_pr=create_pr,
107
+ )
108
+ print("Reports upload done.")
109
+
110
+ # ---------- 소스 코드 (--with-source) ----------
111
+ if getattr(args, "with_source", False):
112
+ print("Uploading source code ...")
113
+ ignore_common = ["**/__pycache__/**", "**/*.pyc", "**/.DS_Store"]
114
+ for dirname in [d for d in SOURCE_DIRS if d != "eval"]:
115
+ src_dir = PROJECT_ROOT / dirname
116
+ if src_dir.exists():
117
+ api.upload_folder(
118
+ folder_path=str(src_dir),
119
+ path_in_repo=f"source/{dirname}",
120
+ repo_id=args.repo_id,
121
+ repo_type="model",
122
+ ignore_patterns=ignore_common,
123
+ create_pr=create_pr,
124
+ )
125
+ print(f" source/{dirname}/ done.")
126
+ # eval: outputs, results 제외 (대용량)
127
+ eval_dir = PROJECT_ROOT / "eval"
128
+ if eval_dir.exists():
129
+ api.upload_folder(
130
+ folder_path=str(eval_dir),
131
+ path_in_repo="source/eval",
132
+ repo_id=args.repo_id,
133
+ repo_type="model",
134
+ ignore_patterns=ignore_common + ["**/outputs/**", "**/results/**", "**/.compile_cache/**"],
135
+ create_pr=create_pr,
136
+ )
137
+ print(" source/eval/ done.")
138
+ # 루트 문서
139
+ for name in ["README.md", "CLAUDE.md", "requirements.txt", "PROGRESS.md"]:
140
+ src_file = PROJECT_ROOT / name
141
+ if src_file.exists():
142
+ api.upload_file(
143
+ path_or_fileobj=str(src_file),
144
+ path_in_repo=f"source/{name}",
145
+ repo_id=args.repo_id,
146
+ repo_type="model",
147
+ create_pr=create_pr,
148
+ )
149
+ for p in PROJECT_ROOT.glob("PLAN_*.md"):
150
+ api.upload_file(
151
+ path_or_fileobj=str(p),
152
+ path_in_repo=f"source/{p.name}",
153
+ repo_id=args.repo_id,
154
+ repo_type="model",
155
+ create_pr=create_pr,
156
+ )
157
+ print("Source upload done.")
158
+
159
+ # ---------- 데이터 스크립트 (--with-data, .bin 제외) ----------
160
+ if getattr(args, "with_data", False) and DATA_DIR.exists():
161
+ print("Uploading data scripts (no .bin) ...")
162
+ api.upload_folder(
163
+ folder_path=str(DATA_DIR),
164
+ path_in_repo="data",
165
+ repo_id=args.repo_id,
166
+ repo_type="model",
167
+ ignore_patterns=[
168
+ "**/*.bin",
169
+ "**/*.chunk*",
170
+ "**/__pycache__/**",
171
+ "**/code/**",
172
+ "**/*.pyc",
173
+ ],
174
+ create_pr=create_pr,
175
+ )
176
+ print("Data scripts upload done.")
177
+
178
+ print(f"Done. https://huggingface.co/{args.repo_id}")
179
+
180
+
181
+ if __name__ == "__main__":
182
+ main()
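A small post-upload sanity check, sketched with the public huggingface_hub API (the repo id matches the usage examples in the docstring above):

```python
from huggingface_hub import HfApi

api = HfApi()
files = api.list_repo_files("pathcosmos/frankenstallm", repo_type="model")
print(f"{len(files)} files in the repo")
print("\n".join(sorted(f for f in files if f.startswith("eval"))))
```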