Commit 09ea133
1 Parent(s): c0f89d0

Upload folder using huggingface_hub (#16)

- Upload folder using huggingface_hub (da19444cc2b94051b3ee46aa2cc8793a2ea50b72)

Co-authored-by: pathcosmos <somebody-to-love@users.noreply.huggingface.co>
- source/configs/3b_pretrain.yaml +60 -0
- source/configs/clickhouse-config.xml +315 -0
- source/configs/hybrid_3b.yaml +55 -0
- source/configs/korean_1b.yaml +41 -0
- source/configs/korean_1b_fp8.yaml +48 -0
- source/configs/korean_1b_sft.yaml +41 -0
- source/configs/korean_3b_fp8.yaml +49 -0
- source/configs/korean_3b_orpo.yaml +49 -0
- source/configs/korean_3b_sft.yaml +47 -0
- source/configs/korean_3b_sft_v2.yaml +54 -0
- source/configs/medium.yaml +30 -0
- source/configs/small.yaml +30 -0
- source/configs/small_fp8.yaml +35 -0
source/configs/3b_pretrain.yaml
ADDED
@@ -0,0 +1,60 @@
# Korean LLM 3B parameters — FP8 (B200 TransformerEngine MXFP8)
#
# [Architecture rationale — 2026-02-27]
# - Based on the Justice League proposal: d_model=2560, 32L, 32H, 8KV
# - Parameters: ~2.39B ("3B-class" — lighter than Llama-3.2-3B; efficient 64K Korean vocab)
# - d_ffn=6912: 2.7×d_model, multiple of 16 for FP8 alignment
# - GQA 4:1 (32H:8KV) — inference efficiency + KV-cache savings
# - head_dim=80 (2560/32) — efficient with Flash Attention
#
# [Data / training design]
# - Data: korean_train.bin, 8.91B tokens
# - Chinchilla optimum: 2.4B × 20 = 48B tokens
# - Actual target: 60B tokens (6.7 epochs) — extra epochs are favorable for a monolingual Korean corpus
# - max_steps 57000 = 60B tokens / 1,048,576 tok/step
#
# [GPU memory estimate — 8× B200, 183 GB]
# - Model (FP8): 2.4 GB
# - Optimizer (bf16 master + fp32 mom/var): 23.9 GB
# - Gradient (bf16): 4.8 GB
# - Activation (per GPU, bs=8): ~27 GB
# - Total: ~58 GB/GPU (31.7% utilization) → plenty of headroom
#
# Run:  bash scripts/launch_korean_3b.sh
# Test: RUN_NAME=korean_3b_test bash scripts/launch_korean_3b.sh --max_steps 50

model:
  vocab_size: 64000
  d_model: 2560
  n_layers: 32
  n_heads: 32
  n_kv_heads: 8        # GQA 4:1 (75% fewer K/V parameters)
  d_ffn: 6912          # 2.7×d_model, multiple of 16 (FP8 alignment)
  max_seq_len: 4096
  rope_theta: 500000.0
  dropout: 0.0
  bias: false
  use_flash_attn: true
  use_fp8: true        # TransformerEngine MXFP8BlockScaling (B200 native)

train:
  # 57k steps × 1,048,576 tok/step = 59.8B tokens ≈ 6.7 epochs
  max_steps: 57000
  batch_size: 4        # per GPU: 4 × 4096 = 16,384 tokens | VRAM ~130 GB (71% of 183 GB)
  grad_accum_steps: 8  # eff_batch: 4 × 8 GPUs × 8 × 4096 = 1,048,576 tok/step
  lr: 1.5e-4           # 3B scale: per GPT-3 scaling, 1B (2e-4) → 3B (1.5e-4)
  weight_decay: 0.1
  warmup_steps: 2000   # 3.5% of 57k steps — stable warmup
  max_grad_norm: 1.0
  log_interval: 10
  save_interval: 1000  # ~57 checkpoints over 57k steps
  eval_interval: 500   # val-loss monitoring
  use_amp: false       # replaced by fp8_autocast
  compile_model: false # TE 2.10 + DDP graph-break risk
  fp8_amax_history_len: 16
  fp8_amax_compute_algo: "max"
  fp8_format: "MXFP8"  # B200 Blackwell native block scaling

tokenizer:
  vocab_size: 64000
  type: sentencepiece_unigram
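The parameter and token-budget figures in the header comments are reproducible with a few lines of arithmetic. A minimal Python sketch — it assumes SwiGLU gate/up/down FFN projections, GQA-shaped K/V projections, and a tied input/output embedding, none of which are spelled out in this commit:

# Parameter count for d_model=2560, 32 layers, 32 heads, 8 KV heads, d_ffn=6912, 64K vocab;
# assumes SwiGLU (gate/up/down), GQA projections, and one tied embedding table.
d, n_layers, n_heads, n_kv, d_ffn, vocab = 2560, 32, 32, 8, 6912, 64000
head_dim = d // n_heads                          # 80
attn = 2 * d * d + 2 * d * (n_kv * head_dim)     # Q + O projections, plus K + V (GQA)
ffn = 3 * d * d_ffn                              # gate, up, down
params = n_layers * (attn + ffn) + vocab * d     # + tied embedding
print(f"~{params / 1e9:.2f}B parameters")        # ≈ 2.39B

# Token budget: 57,000 steps × 1,048,576 tok/step ≈ 59.8B tokens ≈ 6.7 epochs over 8.91B.
print(f"{57_000 * 1_048_576 / 1e9:.1f}B tokens")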
source/configs/clickhouse-config.xml
ADDED
@@ -0,0 +1,315 @@
<?xml version="1.0"?>
<!--
    ClickHouse Server Configuration
    =================================
    Hardware: AMD EPYC 9365 36-Core (72 threads), 2.2 TB RAM, 2x NUMA nodes
    Purpose : Data analytics for LLM training pipeline + factory sensor data (CRM project)
    Generated: 2026-03-01
-->
<clickhouse>

    <!-- =========================================================
         Network / Listen
         Listen on localhost only — GPU training node is local only
         ========================================================= -->
    <listen_host>127.0.0.1</listen_host>
    <http_port>8123</http_port>
    <tcp_port>9000</tcp_port>
    <interserver_http_port>9009</interserver_http_port>

    <!-- =========================================================
         Paths
         Data on GPFS (20 TB, 18 TB free) for large datasets.
         Tmp / logs on local /tmp to reduce GPFS small-file pressure.
         ========================================================= -->
    <path>/PROJECT/0325120031_A/ghong/taketimes/clickhouse-data/</path>
    <tmp_path>/tmp/clickhouse-tmp/</tmp_path>

    <!-- =========================================================
         Logging
         ========================================================= -->
    <logger>
        <level>information</level>
        <log>/tmp/clickhouse/logs/clickhouse-server.log</log>
        <errorlog>/tmp/clickhouse/logs/clickhouse-server.err.log</errorlog>
        <!-- Rotate at 512 MB, keep 10 files -->
        <size>536870912</size>
        <count>10</count>
    </logger>

    <!-- =========================================================
         Memory — server-level cap for all queries combined
         ========================================================= -->
    <max_server_memory_usage>536870912000</max_server_memory_usage>

    <!-- =========================================================
         CPU / Thread Pools (server-level settings)
         Physical cores: 36 per socket, 72 total (2 NUMA nodes).
         Reserve half for GPU training → 36 threads for ClickHouse.
         ========================================================= -->
    <background_pool_size>18</background_pool_size>
    <background_merges_mutations_concurrency_ratio>2</background_merges_mutations_concurrency_ratio>
    <background_move_pool_size>4</background_move_pool_size>
    <background_fetches_pool_size>4</background_fetches_pool_size>
    <background_schedule_pool_size>8</background_schedule_pool_size>
    <background_common_pool_size>8</background_common_pool_size>

    <!-- =========================================================
         Concurrency
         ========================================================= -->
    <max_concurrent_queries>100</max_concurrent_queries>
    <max_waiting_queries>50</max_waiting_queries>

    <!-- =========================================================
         Caches
         mark_cache   : 10 GB — indexes for MergeTree parts
         uncompressed : 20 GB — decompressed block cache
         query_cache  : 2 GB — optional query result cache
         NUMA note: ClickHouse allocates via jemalloc with NUMA
         awareness; no extra config needed beyond thread binding.
         ========================================================= -->
    <mark_cache_size>10737418240</mark_cache_size>
    <uncompressed_cache_size>21474836480</uncompressed_cache_size>
    <!-- Available from ClickHouse 23.x -->
    <query_cache>
        <max_size_in_bytes>2147483648</max_size_in_bytes>
        <max_entries>1024</max_entries>
        <max_entry_size_in_bytes>104857600</max_entry_size_in_bytes>
        <max_entry_size_in_rows>30000000</max_entry_size_in_rows>
    </query_cache>
    <!-- Compiled expression cache -->
    <compiled_expression_cache_size>134217728</compiled_expression_cache_size>
    <compiled_expression_cache_elements_size>10000</compiled_expression_cache_elements_size>

    <!-- =========================================================
         I/O
         GPFS is a parallel filesystem — large sequential reads are
         efficient; use aggressive read-ahead and prefetch.
         NVMe local disks can be used for tmp / intermediate data.
         ========================================================= -->
    <!-- MergeTree concurrent read settings moved to <profiles><default> -->
    <!-- Async reads from object/POSIX storage -->
    <asynchronous_metrics_update_period_s>60</asynchronous_metrics_update_period_s>

    <!-- Async Insert settings moved to <profiles><default> below -->

    <!-- =========================================================
         MergeTree Storage Settings
         ========================================================= -->
    <merge_tree>
        <!-- Bytes: prefer larger parts on GPFS to reduce metadata overhead -->
        <max_bytes_to_merge_at_max_space_in_pool>161061273600</max_bytes_to_merge_at_max_space_in_pool>
        <!-- Allow up to 300 parts per partition before slowing inserts -->
        <parts_to_throw_insert>300</parts_to_throw_insert>
        <parts_to_delay_insert>150</parts_to_delay_insert>
        <!-- Use AVX-512 SIMD for sorting / hashing where available -->
        <use_minimalistic_part_header_in_zookeeper>1</use_minimalistic_part_header_in_zookeeper>
        <!-- Keep deleted data up to 8 hours before final cleanup -->
        <old_parts_lifetime>28800</old_parts_lifetime>
    </merge_tree>

    <!-- =========================================================
         Compression
         LZ4 default (fast, AVX2/AVX-512 accelerated).
         ZSTD level 3 for cold / archival tables — trade CPU for space.
         ========================================================= -->
    <compression>
        <!-- Hot data: LZ4 -->
        <case>
            <min_part_size>1073741824</min_part_size>
            <min_part_size_ratio>0.01</min_part_size_ratio>
            <method>lz4</method>
        </case>
        <!-- Very large parts: ZSTD for better ratio -->
        <case>
            <min_part_size>10737418240</min_part_size>
            <min_part_size_ratio>0.1</min_part_size_ratio>
            <method>zstd</method>
            <level>3</level>
        </case>
    </compression>

    <!-- =========================================================
         Users / Access Control
         Single local user, no password (localhost-only listen).
         See users.xml (or inline below) for quota/profile.
         ========================================================= -->
    <users>
        <default>
            <password></password>
            <networks>
                <ip>127.0.0.1/8</ip>
                <ip>::1</ip>
            </networks>
            <profile>default</profile>
            <quota>default</quota>
            <!-- Allow DDL from default user -->
            <access_management>1</access_management>
        </default>
    </users>

    <profiles>
        <default>
            <!-- Memory per query: 500 GB -->
            <max_memory_usage>536870912000</max_memory_usage>
            <max_bytes_before_external_group_by>483183820800</max_bytes_before_external_group_by>
            <max_bytes_before_external_sort>483183820800</max_bytes_before_external_sort>
            <!-- Threads per query: half of 72 cores -->
            <max_threads>36</max_threads>
            <max_concurrent_queries_for_user>100</max_concurrent_queries_for_user>
            <use_uncompressed_cache>1</use_uncompressed_cache>
            <!-- O_DIRECT for large scans -->
            <min_bytes_to_use_direct_io>10737418240</min_bytes_to_use_direct_io>
            <!-- SIMD JSON parsing (AVX-512) -->
            <input_format_parallel_parsing>1</input_format_parallel_parsing>
            <output_format_parallel_formatting>1</output_format_parallel_formatting>
            <!-- Async insert for sensor/CRM streaming -->
            <async_insert>1</async_insert>
            <async_insert_max_data_size>33554432</async_insert_max_data_size>
            <async_insert_busy_timeout_ms>200</async_insert_busy_timeout_ms>
            <async_insert_deduplicate>0</async_insert_deduplicate>
            <wait_for_async_insert>1</wait_for_async_insert>
            <wait_for_async_insert_timeout>5</wait_for_async_insert_timeout>
            <!-- MergeTree concurrent read -->
            <merge_tree_min_rows_for_concurrent_read>20000</merge_tree_min_rows_for_concurrent_read>
            <merge_tree_min_bytes_for_concurrent_read>24117248</merge_tree_min_bytes_for_concurrent_read>
        </default>
    </profiles>

    <quotas>
        <default>
            <interval>
                <duration>3600</duration>
                <queries>0</queries>
                <errors>0</errors>
                <result_rows>0</result_rows>
                <read_rows>0</read_rows>
                <execution_time>0</execution_time>
            </interval>
        </default>
    </quotas>

    <!-- =========================================================
         Distributed DDL (single-node — disable ZooKeeper dependency)
         ========================================================= -->
    <!-- No ZooKeeper configured; replicated tables use ReplicatedMergeTree
         only if ZK is added later. Commented out to avoid startup warnings.
    <zookeeper>
        <node>
            <host>localhost</host>
            <port>2181</port>
        </node>
    </zookeeper>
    -->

    <!-- =========================================================
         Timezone
         ========================================================= -->
    <timezone>Asia/Seoul</timezone>

    <!-- =========================================================
         Query Log / System Tables
         Keep 30 days of query history for pipeline debugging.
         ========================================================= -->
    <query_log>
        <database>system</database>
        <table>query_log</table>
        <partition_by>toYYYYMM(event_date)</partition_by>
        <ttl>event_date + INTERVAL 30 DAY</ttl>
        <flush_interval_milliseconds>7500</flush_interval_milliseconds>
        <max_size_rows>1048576</max_size_rows>
    </query_log>

    <query_thread_log>
        <database>system</database>
        <table>query_thread_log</table>
        <partition_by>toYYYYMM(event_date)</partition_by>
        <ttl>event_date + INTERVAL 7 DAY</ttl>
        <flush_interval_milliseconds>7500</flush_interval_milliseconds>
    </query_thread_log>

    <part_log>
        <database>system</database>
        <table>part_log</table>
        <partition_by>toYYYYMM(event_date)</partition_by>
        <ttl>event_date + INTERVAL 14 DAY</ttl>
        <flush_interval_milliseconds>5000</flush_interval_milliseconds>
    </part_log>

    <trace_log>
        <database>system</database>
        <table>trace_log</table>
        <partition_by>toYYYYMM(event_date)</partition_by>
        <ttl>event_date + INTERVAL 7 DAY</ttl>
        <flush_interval_milliseconds>7500</flush_interval_milliseconds>
    </trace_log>

    <metric_log>
        <database>system</database>
        <table>metric_log</table>
        <flush_interval_milliseconds>7500</flush_interval_milliseconds>
        <collect_interval_milliseconds>1000</collect_interval_milliseconds>
        <ttl>event_date + INTERVAL 7 DAY</ttl>
    </metric_log>

    <asynchronous_metric_log>
        <database>system</database>
        <table>asynchronous_metric_log</table>
        <flush_interval_milliseconds>7500</flush_interval_milliseconds>
        <ttl>event_date + INTERVAL 7 DAY</ttl>
    </asynchronous_metric_log>

    <!-- =========================================================
         Crash Handler
         ========================================================= -->
    <core_dump>
        <size_limit>0</size_limit>
    </core_dump>

    <!-- =========================================================
         Keeper (built-in, single-node mode — replaces ZooKeeper
         if you want ReplicatedMergeTree without external ZK).
         Uncomment if needed.
         ========================================================= -->
    <!--
    <keeper_server>
        <tcp_port>9181</tcp_port>
        <server_id>1</server_id>
        <log_storage_path>/PROJECT/0325120031_A/ghong/taketimes/clickhouse-data/keeper/logs</log_storage_path>
        <snapshot_storage_path>/PROJECT/0325120031_A/ghong/taketimes/clickhouse-data/keeper/snapshots</snapshot_storage_path>
        <coordination_settings>
            <operation_timeout_ms>10000</operation_timeout_ms>
            <session_timeout_ms>30000</session_timeout_ms>
            <raft_logs_level>warning</raft_logs_level>
        </coordination_settings>
        <raft_configuration>
            <server>
                <id>1</id>
                <hostname>localhost</hostname>
                <port>9444</port>
            </server>
        </raft_configuration>
    </keeper_server>
    -->

    <!-- =========================================================
         AVX-512 / SIMD hints
         ClickHouse auto-detects CPUID at runtime; these flags are
         informational comments — no XML knobs needed.
         Detected: avx512f, avx512bw, avx512vl, avx512_vnni, avx512_bf16
         Used in: LZ4 compression, hash aggregation, sorting, filters.
         ========================================================= -->

    <!-- =========================================================
         Miscellaneous
         ========================================================= -->
    <!-- Skip strict settings check for forward-compat -->
    <skip_check_for_incorrect_settings>1</skip_check_for_incorrect_settings>
    <!-- Graceful shutdown: wait up to 60 s for running queries -->
    <shutdown_wait_unfinished>60</shutdown_wait_unfinished>
    <!-- Send anonymous usage statistics: off for private server -->
    <send_crash_reports>
        <enabled>false</enabled>
    </send_crash_reports>

</clickhouse>
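With listen_host restricted to 127.0.0.1 and http_port 8123, the server is reachable only from the training node itself. A quick way to probe it from the pipeline is the HTTP interface; a small Python sketch using requests (system.query_log is the built-in log table enabled above, everything else is illustrative):

import requests

CLICKHOUSE = "http://127.0.0.1:8123"

# Built-in liveness endpoint; returns "Ok." when the server is up.
print(requests.get(f"{CLICKHOUSE}/ping", timeout=5).text.strip())

# Ad-hoc query through the HTTP interface's ?query= parameter
# (system.query_log exists because <query_log> is enabled in the config above).
resp = requests.get(
    CLICKHOUSE,
    params={"query": "SELECT count() FROM system.query_log"},
    timeout=30,
)
print(resp.text.strip())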
source/configs/hybrid_3b.yaml
ADDED
@@ -0,0 +1,55 @@
# FRANKENSTALLM-H 3B: Hybrid Mamba-2 + Transformer
#
# [Design rationale — 2026-03-05]
# - Architecture: based on Nemotron-H 8B Dense, applied at 3B scale
# - 40 layers: 37 Mamba-2 + 3 attention (layers 13, 26, 39)
# - Parameters: ~2.9B (including embeddings)
# - Data: 3b_train.bin (same data as the existing pure-Transformer run)
# - lr=2e-4: per the Mamba-2 paper, slightly higher than for a Transformer
# - 3 attention layers: placed evenly at early (13), middle (26), and late (39) positions
# - Mamba advantages: O(n) sequence processing, constant memory at inference
#
# Run: bash scripts/launch_hybrid_3b.sh
model:
  vocab_size: 64000
  d_model: 3072
  n_layers: 40
  n_heads: 24
  n_kv_heads: 8
  d_ffn: 9216
  max_seq_len: 4096
  rope_theta: 500000.0
  dropout: 0.0
  bias: false
  use_flash_attn: true
  use_fp8: true
  # Hybrid settings
  use_hybrid: true
  hybrid_pattern: "M M M M M M M M M M M M M A M M M M M M M M M M M M A M M M M M M M M M M M M A"
  mamba_d_state: 128
  mamba_head_dim: 64
  mamba_expand: 2
  mamba_conv_kernel: 4
  mamba_n_groups: 1
  mamba_chunk_size: 256

train:
  max_steps: 57000
  batch_size: 4
  grad_accum_steps: 8
  lr: 2e-4
  weight_decay: 0.1
  warmup_steps: 2000
  max_grad_norm: 1.0
  log_interval: 10
  save_interval: 2000
  eval_interval: 500
  use_amp: false
  compile_model: false
  fp8_amax_history_len: 16
  fp8_amax_compute_algo: "max"
  fp8_format: "MXFP8"

tokenizer:
  vocab_size: 64000
  type: sentencepiece_unigram
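The hybrid_pattern string is the single source of truth for the Mamba/attention layout, so it is worth checking that it encodes what the header claims (37 M + 3 A, attention at layers 13/26/39). A small PyYAML sketch that just parses the config and reports the layout:

# Parse the hybrid pattern from the config and report the layer layout.
import yaml

with open("source/configs/hybrid_3b.yaml") as f:
    cfg = yaml.safe_load(f)

pattern = cfg["model"]["hybrid_pattern"].split()
attn_layers = [i for i, kind in enumerate(pattern) if kind == "A"]
print(len(pattern), "layers:", pattern.count("M"), "Mamba-2 +", pattern.count("A"), "attention")
print("attention at layers:", attn_layers)   # header expects [13, 26, 39]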
source/configs/korean_1b.yaml
ADDED
@@ -0,0 +1,41 @@
# Korean LLM 1B parameters — BF16 baseline config
# Optimized for 8× B200 GPUs; GQA (4:1) + SwiGLU + RoPE (long context)
#
# Architecture arithmetic:
#   d_ffn = int(2/3 * 4 * 2048) = 5461 → rounded up to a multiple of 16 = 5472 (FP8 alignment)
#   Actual parameter count ≈ 12 * 24 * 2048^2 = 1,207,959,552 (~1.2B)
#
# Training setup:
#   eff_batch = 4 (bs) * 8 (GPUs) * 8 (accum) * 4096 (seq) = 1,048,576 tokens/step
#   200,000 steps × 1M tok = 200B tokens processed
model:
  vocab_size: 64000
  d_model: 2048
  n_layers: 24
  n_heads: 16
  n_kv_heads: 4        # GQA: 4 KV groups, 16 query heads (4:1 ratio)
  d_ffn: 5472          # SwiGLU: int(2/3 * 4 * 2048) = 5461 → multiple of 16 = 5472
  max_seq_len: 4096
  rope_theta: 500000.0 # Llama-3-style high base for long-context extrapolation
  dropout: 0.0
  bias: false
  use_flash_attn: true
  use_fp8: false       # BF16 baseline; see korean_1b_fp8.yaml for FP8

train:
  max_steps: 200000
  batch_size: 4        # per GPU: 4 × 4096 = 16,384 tokens
  grad_accum_steps: 8  # eff_batch: 4 × 8 GPUs × 8 × 4096 = 1,048,576 tok/step
  lr: 2.0e-4
  weight_decay: 0.1
  warmup_steps: 4000
  max_grad_norm: 1.0
  log_interval: 10
  save_interval: 1000
  eval_interval: 500
  use_amp: true        # BF16 mixed precision
  compile_model: false

tokenizer:
  vocab_size: 64000
  type: sentencepiece_unigram
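The d_ffn, parameter-count and effective-batch figures quoted in the header follow directly from the arithmetic in the comments; a short Python sketch that reproduces them:

# d_ffn rounding, rule-of-thumb parameter count, and effective batch from the header.
d_model, n_layers = 2048, 24
d_ffn_raw = int(2 / 3 * 4 * d_model)        # SwiGLU sizing → 5461
d_ffn = (d_ffn_raw + 15) // 16 * 16         # round up to a multiple of 16 → 5472
params = 12 * n_layers * d_model ** 2       # ≈ 1,207,959,552 (~1.2B)
eff_batch = 4 * 8 * 8 * 4096                # bs × GPUs × grad_accum × seq = 1,048,576
print(d_ffn, params, eff_batch)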
source/configs/korean_1b_fp8.yaml
ADDED
@@ -0,0 +1,48 @@
# Korean LLM 1B parameters — FP8 variant (B200 TransformerEngine native)
#
# [Optimization rationale — 2026-02-25]
# - Data: korean_train.bin, 8.91B tokens
# - max_steps 34000 = 4 epochs (Muennighoff 2023: val loss rises past 4 epochs)
#   * previous 200k steps = 23.5 epochs → overfitting risk, wasted compute
# - lr=2e-4: exactly matches the GPT-3 1.3B reference (unchanged)
# - eff_batch=1.05M: matches the GPT-3 1.3B reference (unchanged)
# - warmup 2000 = 5.9% of 34k (the previous 4000 = 11.8% was excessive)
# - shorter save/eval intervals: 34k steps call for denser checkpointing
# - compile_model: false (TE 2.10 graph-break risk; stability first)
#
# Run: bash scripts/launch_korean_1b.sh
model:
  vocab_size: 64000
  d_model: 2048
  n_layers: 24
  n_heads: 16
  n_kv_heads: 4        # GQA 4:1 (75% fewer K/V parameters)
  d_ffn: 5472          # multiple of 16 (satisfies FP8 alignment)
  max_seq_len: 4096
  rope_theta: 500000.0
  dropout: 0.0
  bias: false
  use_flash_attn: true
  use_fp8: true        # TransformerEngine MXFP8BlockScaling (B200 native)

train:
  # 34k steps × 1,048,576 tok/step = 35.6B tokens = 4 epochs (on 8.91B tokens of data)
  max_steps: 34000
  batch_size: 8        # per GPU: 8 × 4096 = 32,768 tokens | uses 30.8% of VRAM (192 GB)
  grad_accum_steps: 4  # eff_batch: 8 × 8 GPUs × 4 × 4096 = 1,048,576 tok/step
  lr: 2.0e-4           # exactly the GPT-3 1.3B reference optimum
  weight_decay: 0.1
  warmup_steps: 2000   # 5.9% of 34k steps — the previous 4000 (11.8%) was excessive
  max_grad_norm: 1.0
  log_interval: 10
  save_interval: 500   # ~70 checkpoints over 34k steps (the previous 1000 was too sparse)
  eval_interval: 200   # early detection of val-loss anomalies
  use_amp: false       # replaced by fp8_autocast (torch.autocast not needed)
  compile_model: false # TE 2.10 + DDP graph-break risk
  fp8_amax_history_len: 16
  fp8_amax_compute_algo: "max"
  fp8_format: "MXFP8"  # B200 Blackwell native block scaling

tokenizer:
  vocab_size: 64000
  type: sentencepiece_unigram
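The 4-epoch budget above is one multiplication away; a quick check of the numbers the comments use:

# korean_train.bin size vs. the 34k-step budget above.
corpus = 8.91e9                          # tokens in korean_train.bin
tok_per_step = 8 * 8 * 4 * 4096          # bs × GPUs × grad_accum × seq = 1,048,576
total = 34_000 * tok_per_step            # ≈ 35.65B tokens
print(f"{total / 1e9:.1f}B tokens ≈ {total / corpus:.1f} epochs")   # ~4.0 epochs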
source/configs/korean_1b_sft.yaml
ADDED
@@ -0,0 +1,41 @@
# Korean LLM 1B — SFT (Supervised Fine-Tuning) config
#
# Base model: korean_1b_fp8_run1/checkpoint-0034000 (1.19B params, 34k pretrain steps)
# SFT goals: instruction following + mitigate repetition degeneration + improve generation quality
#
# Run: bash scripts/launch_sft.sh

model:
  vocab_size: 64000
  d_model: 2048
  n_layers: 24
  n_heads: 16
  n_kv_heads: 4
  d_ffn: 5472
  max_seq_len: 4096
  rope_theta: 500000.0
  dropout: 0.0
  bias: false
  use_flash_attn: true
  use_fp8: true

train:
  max_steps: 5000      # SFT: a few thousand steps suffice (~10% of the 34k pretrain)
  batch_size: 4        # per GPU (kept small; SFT sequence lengths vary)
  grad_accum_steps: 2  # eff_batch: 4 × 8 GPUs × 2 × 4096 = 262,144 tok/step
  lr: 2.0e-5           # 1/10 of pretrain (prevents catastrophic forgetting)
  weight_decay: 0.01   # weaker than the 0.1 used for pretraining
  warmup_steps: 150    # 3.3% of 3000 steps
  max_grad_norm: 1.0
  log_interval: 10
  save_interval: 500
  eval_interval: 100
  use_amp: false       # not needed with FP8
  compile_model: false
  fp8_amax_history_len: 16
  fp8_amax_compute_algo: "max"
  fp8_format: "MXFP8"

tokenizer:
  vocab_size: 64000
  type: sentencepiece_unigram
source/configs/korean_3b_fp8.yaml
ADDED
@@ -0,0 +1,49 @@
# Korean LLM 3B parameters — FP8 (B200 TransformerEngine MXFP8)
#
# [Design rationale — 2026-02-27]
# - Architecture: modeled on LLaMA-3 3B (d=3072, 28L, 24H, GQA 3:1 with 8 KV heads)
# - Parameters: ~3.0B (including embeddings)
# - Data: korean_train.bin, 8.91B tokens → at least 60B tokens (7 epochs)
# - Chinchilla optimal: 3B model → 60B tokens; 100B recommended in practice
# - lr=1.5e-4: LLaMA-3 3B reference (lowered from the 1B's 2e-4; μP scaling ~1/sqrt(3))
# - eff_batch=2M tokens: per GPT-3 scaling laws at the 3B scale
# - Checkpoints: ~27 GB each, every 2000 steps → at most ~30 = 810 GB
# - Estimated training time: ~72-96 hours on 8× B200 with FP8 (60B tokens)
#
# Run: bash scripts/launch_3b_pretrain.sh
model:
  vocab_size: 64000
  d_model: 3072
  n_layers: 28
  n_heads: 24
  n_kv_heads: 8        # GQA 3:1 (memory efficiency / quality balance)
  d_ffn: 8192          # ~2.67× d_model, multiple of 128 (FP8 alignment)
  max_seq_len: 4096
  rope_theta: 500000.0
  dropout: 0.0
  bias: false
  use_flash_attn: true
  use_fp8: true

train:
  # Phase 1: 60B tokens (minimum) = 57000 steps × 2^20 tok/step
  # Phase 2: 100B tokens (recommended) = 95000 steps
  max_steps: 57000
  batch_size: 5        # per GPU: 5 × 4096 = 20,480 tokens (~161/183 GB VRAM after QKV fusion, 21 GB headroom)
  grad_accum_steps: 8  # eff_batch: 5 × 8 GPUs × 8 × 4096 = 1,310,720 tok/step (~1.3M)
  lr: 1.5e-4           # LLaMA-3 3B scale, with Chinchilla as reference
  weight_decay: 0.1
  warmup_steps: 2000   # 3.5% of 57k
  max_grad_norm: 1.0
  log_interval: 10
  save_interval: 2000  # 27 GB/checkpoint → every 2000 steps = ~28 checkpoints = 756 GB
  eval_interval: 500
  use_amp: false
  compile_model: false
  fp8_amax_history_len: 16     # NOTE: ignored with the MXFP8 format (DelayedScaling only)
  fp8_amax_compute_algo: "max" # NOTE: ignored with the MXFP8 format
  fp8_format: "MXFP8"

tokenizer:
  vocab_size: 64000
  type: sentencepiece_unigram
source/configs/korean_3b_orpo.yaml
ADDED
@@ -0,0 +1,49 @@
# Korean 3B ORPO Configuration (Phase 3)
#
# Base model: SFT v1 best checkpoint (HF format)
# Goals: greedy repetition rate 73% → under 30%, EOS termination rate 0% → 80%+
#
# Run:
#   bash scripts/launch_3b_orpo.sh                  # full training
#   bash scripts/launch_3b_orpo.sh --max_steps 200  # quick test
#
# [Design rationale]
# - beta=0.25: a 73% repetition rate is extreme → a strong OR loss is needed (raised from the previous 0.1)
# - lr=1.2e-5: best of a 6-config HP sweep (eval_loss 1.862, margin +0.009)
# - epochs=2: 3 epochs on a ~683K-sample set risks overfitting
# - max_length=1536: P95=880 tokens, covers 99%+ of samples, saves 25% VRAM

# Model
model_path: eval/outputs/hf_3b_sft_best
output_dir: checkpoints/korean_3b_orpo

# Training
epochs: 2
lr: 1.2e-5
beta: 0.25
batch_size: 4
gradient_accumulation_steps: 4
max_length: 1536
bf16: true
weight_decay: 0.01
seed: 42

# Scheduler
lr_scheduler_type: cosine
warmup_ratio: 0.05

# Evaluation & Early Stopping
eval_split_ratio: 0.05
eval_steps: 500
early_stopping_patience: 3

# Logging & checkpointing
logging_steps: 10
save_steps: 500
save_total_limit: 5
gradient_checkpointing: true
report_to: none

# Data
custom_data_path: data/preference/combined_preference.jsonl
dataset_num_proc: 64
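beta here scales the odds-ratio term of the ORPO objective optimized on the preference pairs in combined_preference.jsonl. The following is not the project's trainer code, just a minimal PyTorch sketch of the loss that beta=0.25 weights, taking length-normalized (mean per-token) log-probabilities of the chosen and rejected responses as inputs:

import torch
import torch.nn.functional as F

def orpo_loss(logp_chosen, logp_rejected, beta=0.25):
    """Sketch of the ORPO objective: SFT loss on the chosen response plus a
    beta-weighted odds-ratio penalty. Inputs are mean per-token log-probabilities
    of the chosen / rejected responses under the policy."""
    # log odds(y) = log p - log(1 - p), computed stably from log p
    log_odds_chosen = logp_chosen - torch.log1p(-torch.exp(logp_chosen))
    log_odds_rejected = logp_rejected - torch.log1p(-torch.exp(logp_rejected))
    ratio = log_odds_chosen - log_odds_rejected
    l_or = -F.logsigmoid(ratio)     # pushes chosen odds above rejected odds
    l_sft = -logp_chosen            # ordinary NLL on the chosen response
    return (l_sft + beta * l_or).mean()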
source/configs/korean_3b_sft.yaml
ADDED
@@ -0,0 +1,47 @@
# Korean 3B SFT Configuration
#
# Base model: checkpoints/korean_3b_fp8_run1/checkpoint-XXXXXX (3B params, pretrained)
# SFT goals: instruction following + mitigate repetition degeneration + improve generation quality
# Architecture: modeled on LLaMA-3 3B (d=3072, 28L, 24H, GQA 3:1 with 8 KV heads)
#
# Run: bash scripts/launch_3b_sft.sh
#
# [Design rationale — 2026-03-02]
# - batch: 2 × 8 GPUs × 4 grad_accum = 64 eff_batch
# - max_steps 33000 ≈ 3 epochs × 700K samples / 64 eff_batch
# - lr=1e-5: 1/15 of the 1.5e-4 pretrain LR (prevents catastrophic forgetting)
# - NEFTune alpha=5.0: improves generation diversity, mitigates repetition degeneration
# - use_fp8=true: keeps B200-native MXFP8 acceleration

model:
  vocab_size: 64000
  d_model: 3072
  n_layers: 28
  n_heads: 24
  n_kv_heads: 8
  d_ffn: 8192
  max_seq_len: 4096
  rope_theta: 500000.0
  dropout: 0.0
  bias: false
  use_flash_attn: true
  use_fp8: true

train:
  max_steps: 33000     # 3 epochs × 700K / 64 eff_batch
  batch_size: 2        # per GPU (saves VRAM for the 3B model)
  grad_accum_steps: 4  # eff_batch: 2 × 8 GPUs × 4 = 64
  lr: 1.0e-5           # prevents catastrophic forgetting
  weight_decay: 0.01
  warmup_steps: 500
  max_grad_norm: 1.0
  log_interval: 10
  save_interval: 2000
  eval_interval: 500
  use_amp: false
  compile_model: false
  neftune_alpha: 5.0   # NEFTune noise injection

tokenizer:
  vocab_size: 64000
  type: sentencepiece_unigram
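neftune_alpha: 5.0 refers to NEFTune-style uniform noise added to the input embeddings during SFT. A minimal sketch of the scaling rule (alpha / sqrt(seq_len × d_model)); how the repo actually hooks this into the embedding layer is not shown in this commit:

import torch

def neftune_noise(embeddings: torch.Tensor, alpha: float = 5.0) -> torch.Tensor:
    """Add NEFTune noise to token embeddings of shape (batch, seq_len, d_model).
    Noise is uniform in [-1, 1], scaled by alpha / sqrt(seq_len * d_model)."""
    batch, seq_len, d_model = embeddings.shape
    scale = alpha / (seq_len * d_model) ** 0.5
    noise = torch.empty_like(embeddings).uniform_(-1, 1) * scale
    return embeddings + noise   # applied only during training, not at eval time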
source/configs/korean_3b_sft_v2.yaml
ADDED
@@ -0,0 +1,54 @@
# Korean 3B SFT v2 Configuration
#
# Base model: checkpoints/korean_3b_fp8_run1/checkpoint-0057000 (3B params, pretrained)
# SFT v2 goals: fix v1's underfitting + prevent forgetting (data mixing)
# Architecture: modeled on LLaMA-3 3B (d=3072, 28L, 24H, GQA 3:1 with 8 KV heads)
#
# Run: bash scripts/launch_3b_sft_v2.sh
#
# [Design rationale — SFT v1 failure analysis, 2026-03-06]
# v1 problem: lr=1e-5 → zero change in val_loss (effectively no learning)
# v2 changes:
#   - lr: 1e-5 → 5e-5 (5× up, standard range for 3B SFT)
#   - batch: 4 × 8 GPUs × 8 grad_accum = 256 eff_batch (4× larger than v1)
#   - warmup: 500 → 2000 (stabilizes the higher LR)
#   - max_steps: 33000 → 15000 (faster convergence, avoids overfitting)
#   - weight_decay: 0.01 → 0.05 (suppresses forgetting)
#   - data mixing: 70% SFT + 30% pretrain (prevents forgetting)

model:
  vocab_size: 64000
  d_model: 3072
  n_layers: 28
  n_heads: 24
  n_kv_heads: 8
  d_ffn: 8192
  max_seq_len: 4096
  rope_theta: 500000.0
  dropout: 0.0
  bias: false
  use_flash_attn: true
  use_fp8: true

train:
  max_steps: 15000     # v1 33000 → 15000 (converges faster)
  batch_size: 4        # v1 2 → 4 (ample VRAM headroom: 48/183 GB)
  grad_accum_steps: 8  # v1 4 → 8 (eff_batch: 4 × 8 GPUs × 8 = 256)
  lr: 5.0e-5           # v1 1e-5 → 5e-5 (5× up, fixes underfitting)
  weight_decay: 0.05   # v1 0.01 → 0.05 (suppresses forgetting)
  warmup_steps: 2000   # v1 500 → 2000 (stabilizes the higher LR)
  max_grad_norm: 1.0   # gradient clipping
  log_interval: 10
  save_interval: 2000
  eval_interval: 500
  use_amp: false
  compile_model: false
  neftune_alpha: 5.0   # NEFTune noise injection (kept)

  # Data mixing (prevents forgetting)
  pretrain_mix_ratio: 0.3          # mix in 30% pretraining data
  pretrain_data: data/3b_train.bin # path to the pretraining data

tokenizer:
  vocab_size: 64000
  type: sentencepiece_unigram
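pretrain_mix_ratio: 0.3 means roughly 30% of the SFT v2 training stream is drawn from data/3b_train.bin rather than the instruction data. A hypothetical sketch of that sampling decision (the repo's loader may interleave differently):

import random

def pick_source(pretrain_mix_ratio: float = 0.3, rng: random.Random = random.Random(42)) -> str:
    """Return which dataset the next batch should come from so that, on average,
    `pretrain_mix_ratio` of the SFT v2 stream is pretraining data."""
    return "pretrain" if rng.random() < pretrain_mix_ratio else "sft"

counts = {"pretrain": 0, "sft": 0}
for _ in range(10_000):
    counts[pick_source()] += 1
print(counts)   # roughly 3,000 pretrain vs. 7,000 sft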
source/configs/medium.yaml
ADDED
@@ -0,0 +1,30 @@
# Medium LLM ~350M parameters (GPT-2 medium equivalent)
model:
  vocab_size: 32000
  d_model: 1024
  n_layers: 24
  n_heads: 16
  n_kv_heads: 8          # GQA: 2 query heads per KV head
  max_seq_len: 4096
  rope_theta: 500000.0   # extended RoPE for longer context
  dropout: 0.0
  bias: false
  use_flash_attn: true

train:
  max_steps: 200000
  batch_size: 4
  grad_accum_steps: 8    # effective batch = 4 * 8 GPUs * 8 = 256
  lr: 2.0e-4
  weight_decay: 0.1
  warmup_steps: 4000
  max_grad_norm: 1.0
  log_interval: 10
  save_interval: 1000
  eval_interval: 500
  use_amp: true
  compile_model: false

tokenizer:
  vocab_size: 32000
  type: bpe
source/configs/small.yaml
ADDED
@@ -0,0 +1,30 @@
# Small LLM ~125M parameters (GPT-2 small equivalent)
model:
  vocab_size: 32000
  d_model: 768
  n_layers: 12
  n_heads: 12
  n_kv_heads: 12       # MHA (same as n_heads)
  max_seq_len: 2048
  rope_theta: 10000.0
  dropout: 0.0
  bias: false
  use_flash_attn: true

train:
  max_steps: 100000
  batch_size: 8        # per GPU
  grad_accum_steps: 4  # effective batch = 8 * 8 GPUs * 4 = 256
  lr: 3.0e-4
  weight_decay: 0.1
  warmup_steps: 2000
  max_grad_norm: 1.0
  log_interval: 10
  save_interval: 1000
  eval_interval: 500
  use_amp: true
  compile_model: false

tokenizer:
  vocab_size: 32000
  type: bpe
source/configs/small_fp8.yaml
ADDED
@@ -0,0 +1,35 @@
# Small LLM ~125M parameters — FP8 variant (B200 TransformerEngine)
# Based on small.yaml; only changed fields are listed explicitly.
model:
  vocab_size: 32000
  d_model: 768
  n_layers: 12
  n_heads: 12
  n_kv_heads: 12       # MHA (same as n_heads)
  max_seq_len: 2048
  rope_theta: 10000.0
  dropout: 0.0
  bias: false
  use_flash_attn: true
  use_fp8: true        # Enable TransformerEngine FP8 kernels

train:
  max_steps: 100000
  batch_size: 8        # per GPU; 8 * 2048 = 16384 tokens → divisible by 8 ✓
  grad_accum_steps: 4  # effective batch = 8 * 8 GPUs * 4 = 256
  lr: 3.0e-4
  weight_decay: 0.1
  warmup_steps: 2000
  max_grad_norm: 1.0
  log_interval: 10
  save_interval: 1000
  eval_interval: 500
  use_amp: false       # fp8_autocast replaces torch.autocast
  compile_model: false # torch.compile + TE 2.10 stability not verified
  fp8_amax_history_len: 16
  fp8_amax_compute_algo: "max"
  fp8_format: "MXFP8"  # B200 native block scaling (better than HYBRID on Blackwell)

tokenizer:
  vocab_size: 32000
  type: bpe
|