pathcosmos somebody-to-love committed on
Commit
09ea133
·
1 Parent(s): c0f89d0

Upload folder using huggingface_hub (#16)

Browse files

- Upload folder using huggingface_hub (da19444cc2b94051b3ee46aa2cc8793a2ea50b72)


Co-authored-by: pathcosmos <somebody-to-love@users.noreply.huggingface.co>

source/configs/3b_pretrain.yaml ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Korean LLM 3B parameters — FP8 (B200 TransformerEngine MXFP8)
2
+ #
3
+ # [아키텍처 근거 — 2026-02-27]
4
+ # - 저스티스리그 제안 기반: d_model=2560, 32L, 32H, 8KV
5
+ # - 파라미터: ~2.39B ("3B급" — Llama-3.2-3B 대비 경량, 한국어 64K vocab 효율)
6
+ # - d_ffn=6912: 2.7×d_model, 16배수 FP8 정렬
7
+ # - GQA 4:1 (32H:8KV) — 추론 효율 + KV cache 절약
8
+ # - head_dim=80 (2560/32) — Flash Attention 효율적
9
+ #
10
+ # [데이터/학습 설계]
11
+ # - 데이터: korean_train.bin 8.91B tokens
12
+ # - Chinchilla 최적: 2.4B × 20 = 48B tokens
13
+ # - 실제 목표: 60B tokens (6.7 에포크) — 한국어 단일 언어 특성상 추가 학습 유리
14
+ # - max_steps 57000 = 60B tokens / 1,048,576 tok/step
15
+ #
16
+ # [GPU 메모리 예측 — 8× B200 183GB]
17
+ # - 모델 FP8: 2.4 GB
18
+ # - Optimizer (bf16 master + fp32 mom/var): 23.9 GB
19
+ # - Gradient (bf16): 4.8 GB
20
+ # - Activation (per GPU, bs=8): ~27 GB
21
+ # - 합계: ~58 GB/GPU (31.7% 활용) → 여유 충분
22
+ #
23
+ # 실행: bash scripts/launch_korean_3b.sh
24
+ # 테스트: RUN_NAME=korean_3b_test bash scripts/launch_korean_3b.sh --max_steps 50
25
+
26
+ model:
27
+ vocab_size: 64000
28
+ d_model: 2560
29
+ n_layers: 32
30
+ n_heads: 32
31
+ n_kv_heads: 8 # GQA 4:1 (K/V 파라미터 75% 절감)
32
+ d_ffn: 6912 # 2.7×d_model, 16배수 (FP8 alignment)
33
+ max_seq_len: 4096
34
+ rope_theta: 500000.0
35
+ dropout: 0.0
36
+ bias: false
37
+ use_flash_attn: true
38
+ use_fp8: true # TransformerEngine MXFP8BlockScaling (B200 네이티브)
39
+
40
+ train:
41
+ # 57k steps × 1,048,576 tok/step = 59.8B tokens ≈ 6.7 에포크
42
+ max_steps: 57000
43
+ batch_size: 4 # per GPU: 4 × 4096 = 16,384 토큰 | VRAM ~130 GB (183GB의 71%)
44
+ grad_accum_steps: 8 # eff_batch: 4 × 8GPU × 8 × 4096 = 1,048,576 tok/step
45
+ lr: 1.5e-4 # 3B 규모: GPT-3 scaling 기준 1B(2e-4) → 3B(1.5e-4)
46
+ weight_decay: 0.1
47
+ warmup_steps: 2000 # 57k steps의 3.5% — 안정적 warmup
48
+ max_grad_norm: 1.0
49
+ log_interval: 10
50
+ save_interval: 1000 # 57k steps 기준 ~57 체크포인트
51
+ eval_interval: 500 # val loss 모니터링
52
+ use_amp: false # fp8_autocast가 대체
53
+ compile_model: false # TE 2.10 + DDP graph break 위험
54
+ fp8_amax_history_len: 16
55
+ fp8_amax_compute_algo: "max"
56
+ fp8_format: "MXFP8" # B200 Blackwell 네이티브 블록 스케일링
57
+
58
+ tokenizer:
59
+ vocab_size: 64000
60
+ type: sentencepiece_unigram
source/configs/clickhouse-config.xml ADDED
@@ -0,0 +1,315 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0"?>
2
+ <!--
3
+ ClickHouse Server Configuration
4
+ =================================
5
+ Hardware: AMD EPYC 9365 36-Core (72 threads), 2.2 TB RAM, 2x NUMA nodes
6
+ Purpose : Data analytics for LLM training pipeline + factory sensor data (CRM project)
7
+ Generated: 2026-03-01
8
+ -->
9
+ <clickhouse>
10
+
11
+ <!-- =========================================================
12
+ Network / Listen
13
+ Listen on localhost only — GPU training node is local only
14
+ ========================================================= -->
15
+ <listen_host>127.0.0.1</listen_host>
16
+ <http_port>8123</http_port>
17
+ <tcp_port>9000</tcp_port>
18
+ <interserver_http_port>9009</interserver_http_port>
19
+
20
+ <!-- =========================================================
21
+ Paths
22
+ Data on GPFS (20 TB, 18 TB free) for large datasets.
23
+ Tmp / logs on local /tmp to reduce GPFS small-file pressure.
24
+ ========================================================= -->
25
+ <path>/PROJECT/0325120031_A/ghong/taketimes/clickhouse-data/</path>
26
+ <tmp_path>/tmp/clickhouse-tmp/</tmp_path>
27
+
28
+ <!-- =========================================================
29
+ Logging
30
+ ========================================================= -->
31
+ <logger>
32
+ <level>information</level>
33
+ <log>/tmp/clickhouse/logs/clickhouse-server.log</log>
34
+ <errorlog>/tmp/clickhouse/logs/clickhouse-server.err.log</errorlog>
35
+ <!-- Rotate at 512 MB, keep 10 files -->
36
+ <size>536870912</size>
37
+ <count>10</count>
38
+ </logger>
39
+
40
+ <!-- =========================================================
41
+ Memory — server-level cap for all queries combined
42
+ ========================================================= -->
43
+ <max_server_memory_usage>536870912000</max_server_memory_usage>
44
+
45
+ <!-- =========================================================
46
+ CPU / Thread Pools (server-level settings)
47
+ Physical cores: 36 per socket, 72 total (2 NUMA nodes).
48
+ Reserve half for GPU training → 36 threads for ClickHouse.
49
+ ========================================================= -->
50
+ <background_pool_size>18</background_pool_size>
51
+ <background_merges_mutations_concurrency_ratio>2</background_merges_mutations_concurrency_ratio>
52
+ <background_move_pool_size>4</background_move_pool_size>
53
+ <background_fetches_pool_size>4</background_fetches_pool_size>
54
+ <background_schedule_pool_size>8</background_schedule_pool_size>
55
+ <background_common_pool_size>8</background_common_pool_size>
56
+
57
+ <!-- =========================================================
58
+ Concurrency
59
+ ========================================================= -->
60
+ <max_concurrent_queries>100</max_concurrent_queries>
61
+ <max_waiting_queries>50</max_waiting_queries>
62
+
63
+ <!-- =========================================================
64
+ Caches
65
+ mark_cache : 10 GB — indexes for MergeTree parts
66
+ uncompressed : 20 GB — decompressed block cache
67
+ query_cache : 2 GB — optional query result cache
68
+ NUMA note: ClickHouse allocates via jemalloc with NUMA
69
+ awareness; no extra config needed beyond thread binding.
70
+ ========================================================= -->
71
+ <mark_cache_size>10737418240</mark_cache_size>
72
+ <uncompressed_cache_size>21474836480</uncompressed_cache_size>
73
+ <!-- Available from ClickHouse 23.x -->
74
+ <query_cache>
75
+ <max_size_in_bytes>2147483648</max_size_in_bytes>
76
+ <max_entries>1024</max_entries>
77
+ <max_entry_size_in_bytes>104857600</max_entry_size_in_bytes>
78
+ <max_entry_size_in_rows>30000000</max_entry_size_in_rows>
79
+ </query_cache>
80
+ <!-- Compiled expression cache -->
81
+ <compiled_expression_cache_size>134217728</compiled_expression_cache_size>
82
+ <compiled_expression_cache_elements_size>10000</compiled_expression_cache_elements_size>
83
+
84
+ <!-- =========================================================
85
+ I/O
86
+ GPFS is a parallel filesystem — large sequential reads are
87
+ efficient; use aggressive read-ahead and prefetch.
88
+ NVMe local disks can be used for tmp / intermediate data.
89
+ ========================================================= -->
90
+ <!-- MergeTree concurrent read settings moved to <profiles><default> -->
91
+ <!-- Async reads from object/POSIX storage -->
92
+ <asynchronous_metrics_update_period_s>60</asynchronous_metrics_update_period_s>
93
+
94
+ <!-- Async Insert settings moved to <profiles><default> below -->
95
+
96
+ <!-- =========================================================
97
+ MergeTree Storage Settings
98
+ ========================================================= -->
99
+ <merge_tree>
100
+ <!-- Bytes: prefer larger parts on GPFS to reduce metadata overhead -->
101
+ <max_bytes_to_merge_at_max_space_in_pool>161061273600</max_bytes_to_merge_at_max_space_in_pool>
102
+ <!-- Allow up to 300 parts per partition before slowing inserts -->
103
+ <parts_to_throw_insert>300</parts_to_throw_insert>
104
+ <parts_to_delay_insert>150</parts_to_delay_insert>
105
+ <!-- Use AVX-512 SIMD for sorting / hashing where available -->
106
+ <use_minimalistic_part_header_in_zookeeper>1</use_minimalistic_part_header_in_zookeeper>
107
+ <!-- Keep deleted data up to 8 hours before final cleanup -->
108
+ <old_parts_lifetime>28800</old_parts_lifetime>
109
+ </merge_tree>
110
+
111
+ <!-- =========================================================
112
+ Compression
113
+ LZ4 default (fast, AVX2/AVX-512 accelerated).
114
+ ZSTD level 3 for cold / archival tables — trade CPU for space.
115
+ ========================================================= -->
116
+ <compression>
117
+ <!-- Hot data: LZ4 -->
118
+ <case>
119
+ <min_part_size>1073741824</min_part_size>
120
+ <min_part_size_ratio>0.01</min_part_size_ratio>
121
+ <method>lz4</method>
122
+ </case>
123
+ <!-- Very large parts: ZSTD for better ratio -->
124
+ <case>
125
+ <min_part_size>10737418240</min_part_size>
126
+ <min_part_size_ratio>0.1</min_part_size_ratio>
127
+ <method>zstd</method>
128
+ <level>3</level>
129
+ </case>
130
+ </compression>
131
+
132
+ <!-- =========================================================
133
+ Users / Access Control
134
+ Single local user, no password (localhost-only listen).
135
+ See users.xml (or inline below) for quota/profile.
136
+ ========================================================= -->
137
+ <users>
138
+ <default>
139
+ <password></password>
140
+ <networks>
141
+ <ip>127.0.0.1/8</ip>
142
+ <ip>::1</ip>
143
+ </networks>
144
+ <profile>default</profile>
145
+ <quota>default</quota>
146
+ <!-- Allow DDL from default user -->
147
+ <access_management>1</access_management>
148
+ </default>
149
+ </users>
150
+
151
+ <profiles>
152
+ <default>
153
+ <!-- Memory per query: 500 GB -->
154
+ <max_memory_usage>536870912000</max_memory_usage>
155
+ <max_bytes_before_external_group_by>483183820800</max_bytes_before_external_group_by>
156
+ <max_bytes_before_external_sort>483183820800</max_bytes_before_external_sort>
157
+ <!-- Threads per query: half of 72 cores -->
158
+ <max_threads>36</max_threads>
159
+ <max_concurrent_queries_for_user>100</max_concurrent_queries_for_user>
160
+ <use_uncompressed_cache>1</use_uncompressed_cache>
161
+ <!-- O_DIRECT for large scans -->
162
+ <min_bytes_to_use_direct_io>10737418240</min_bytes_to_use_direct_io>
163
+ <!-- SIMD JSON parsing (AVX-512) -->
164
+ <input_format_parallel_parsing>1</input_format_parallel_parsing>
165
+ <output_format_parallel_formatting>1</output_format_parallel_formatting>
166
+ <!-- Async insert for sensor/CRM streaming -->
167
+ <async_insert>1</async_insert>
168
+ <async_insert_max_data_size>33554432</async_insert_max_data_size>
169
+ <async_insert_busy_timeout_ms>200</async_insert_busy_timeout_ms>
170
+ <async_insert_deduplicate>0</async_insert_deduplicate>
171
+ <wait_for_async_insert>1</wait_for_async_insert>
172
+ <wait_for_async_insert_timeout>5</wait_for_async_insert_timeout>
173
+ <!-- MergeTree concurrent read -->
174
+ <merge_tree_min_rows_for_concurrent_read>20000</merge_tree_min_rows_for_concurrent_read>
175
+ <merge_tree_min_bytes_for_concurrent_read>24117248</merge_tree_min_bytes_for_concurrent_read>
176
+ </default>
177
+ </profiles>
178
+
179
+ <quotas>
180
+ <default>
181
+ <interval>
182
+ <duration>3600</duration>
183
+ <queries>0</queries>
184
+ <errors>0</errors>
185
+ <result_rows>0</result_rows>
186
+ <read_rows>0</read_rows>
187
+ <execution_time>0</execution_time>
188
+ </interval>
189
+ </default>
190
+ </quotas>
191
+
192
+ <!-- =========================================================
193
+ Distributed DDL (single-node — disable ZooKeeper dependency)
194
+ ========================================================= -->
195
+ <!-- No ZooKeeper configured; replicated tables use ReplicatedMergeTree
196
+ only if ZK is added later. Commenting out to avoid startup warnings.
197
+ <zookeeper>
198
+ <node>
199
+ <host>localhost</host>
200
+ <port>2181</port>
201
+ </node>
202
+ </zookeeper>
203
+ -->
204
+
205
+ <!-- =========================================================
206
+ Timezone
207
+ ========================================================= -->
208
+ <timezone>Asia/Seoul</timezone>
209
+
210
+ <!-- =========================================================
211
+ Query Log / System Tables
212
+ Keep 30 days of query history for pipeline debugging.
213
+ ========================================================= -->
214
+ <query_log>
215
+ <database>system</database>
216
+ <table>query_log</table>
217
+ <partition_by>toYYYYMM(event_date)</partition_by>
218
+ <ttl>event_date + INTERVAL 30 DAY</ttl>
219
+ <flush_interval_milliseconds>7500</flush_interval_milliseconds>
220
+ <max_size_rows>1048576</max_size_rows>
221
+ </query_log>
222
+
223
+ <query_thread_log>
224
+ <database>system</database>
225
+ <table>query_thread_log</table>
226
+ <partition_by>toYYYYMM(event_date)</partition_by>
227
+ <ttl>event_date + INTERVAL 7 DAY</ttl>
228
+ <flush_interval_milliseconds>7500</flush_interval_milliseconds>
229
+ </query_thread_log>
230
+
231
+ <part_log>
232
+ <database>system</database>
233
+ <table>part_log</table>
234
+ <partition_by>toYYYYMM(event_date)</partition_by>
235
+ <ttl>event_date + INTERVAL 14 DAY</ttl>
236
+ <flush_interval_milliseconds>5000</flush_interval_milliseconds>
237
+ </part_log>
238
+
239
+ <trace_log>
240
+ <database>system</database>
241
+ <table>trace_log</table>
242
+ <partition_by>toYYYYMM(event_date)</partition_by>
243
+ <ttl>event_date + INTERVAL 7 DAY</ttl>
244
+ <flush_interval_milliseconds>7500</flush_interval_milliseconds>
245
+ </trace_log>
246
+
247
+ <metric_log>
248
+ <database>system</database>
249
+ <table>metric_log</table>
250
+ <flush_interval_milliseconds>7500</flush_interval_milliseconds>
251
+ <collect_interval_milliseconds>1000</collect_interval_milliseconds>
252
+ <ttl>event_date + INTERVAL 7 DAY</ttl>
253
+ </metric_log>
254
+
255
+ <asynchronous_metric_log>
256
+ <database>system</database>
257
+ <table>asynchronous_metric_log</table>
258
+ <flush_interval_milliseconds>7500</flush_interval_milliseconds>
259
+ <ttl>event_date + INTERVAL 7 DAY</ttl>
260
+ </asynchronous_metric_log>
261
+
262
+ <!-- =========================================================
263
+ Crash Handler
264
+ ========================================================= -->
265
+ <core_dump>
266
+ <size_limit>0</size_limit>
267
+ </core_dump>
268
+
269
+ <!-- =========================================================
270
+ Keeper (built-in, single-node mode — replaces ZooKeeper
271
+ if you want ReplicatedMergeTree without external ZK).
272
+ Uncomment if needed.
273
+ ========================================================= -->
274
+ <!--
275
+ <keeper_server>
276
+ <tcp_port>9181</tcp_port>
277
+ <server_id>1</server_id>
278
+ <log_storage_path>/PROJECT/0325120031_A/ghong/taketimes/clickhouse-data/keeper/logs</log_storage_path>
279
+ <snapshot_storage_path>/PROJECT/0325120031_A/ghong/taketimes/clickhouse-data/keeper/snapshots</snapshot_storage_path>
280
+ <coordination_settings>
281
+ <operation_timeout_ms>10000</operation_timeout_ms>
282
+ <session_timeout_ms>30000</session_timeout_ms>
283
+ <raft_logs_level>warning</raft_logs_level>
284
+ </coordination_settings>
285
+ <raft_configuration>
286
+ <server>
287
+ <id>1</id>
288
+ <hostname>localhost</hostname>
289
+ <port>9444</port>
290
+ </server>
291
+ </raft_configuration>
292
+ </keeper_server>
293
+ -->
294
+
295
+ <!-- =========================================================
296
+ AVX-512 / SIMD hints
297
+ ClickHouse auto-detects CPUID at runtime; these flags are
298
+ informational comments — no XML knobs needed.
299
+ Detected: avx512f, avx512bw, avx512vl, avx512_vnni, avx512_bf16
300
+ Used in: LZ4 compression, hash aggregation, sorting, filters.
301
+ ========================================================= -->
302
+
303
+ <!-- =========================================================
304
+ Miscellaneous
305
+ ========================================================= -->
306
+ <!-- Skip strict settings check for forward-compat -->
307
+ <skip_check_for_incorrect_settings>1</skip_check_for_incorrect_settings>
308
+ <!-- Graceful shutdown: wait up to 60 s for running queries -->
309
+ <shutdown_wait_unfinished>60</shutdown_wait_unfinished>
310
+ <!-- Send anonymous usage statistics: off for private server -->
311
+ <send_crash_reports>
312
+ <enabled>false</enabled>
313
+ </send_crash_reports>
314
+
315
+ </clickhouse>
source/configs/hybrid_3b.yaml ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # FRANKENSTALLM-H 3B: Hybrid Mamba-2 + Transformer
2
+ #
3
+ # [설계 근거 — 2026-03-05]
4
+ # - 아키텍처: Nemotron-H 8B Dense 참고, 3B 스케일 적용
5
+ # - 40 layers: 37 Mamba-2 + 3 Attention (layer 13, 26, 39)
6
+ # - 파라미터: ~2.9B (embedding 포함)
7
+ # - 데이터: 3b_train.bin (기존 Pure Transformer 동일 데이터)
8
+ # - lr=2e-4: Mamba-2 논문 참고, Transformer보다 약간 높음
9
+ # - Attention 3개: 초반(13), 중반(26), 후반(39) 균등 배치
10
+ # - Mamba 장점: O(n) 시퀀스 처리, 추론 시 constant memory
11
+ #
12
+ # 실행: bash scripts/launch_hybrid_3b.sh
13
+ model:
14
+ vocab_size: 64000
15
+ d_model: 3072
16
+ n_layers: 40
17
+ n_heads: 24
18
+ n_kv_heads: 8
19
+ d_ffn: 9216
20
+ max_seq_len: 4096
21
+ rope_theta: 500000.0
22
+ dropout: 0.0
23
+ bias: false
24
+ use_flash_attn: true
25
+ use_fp8: true
26
+ # Hybrid settings
27
+ use_hybrid: true
28
+ hybrid_pattern: "M M M M M M M M M M M M M A M M M M M M M M M M M M A M M M M M M M M M M M M A"
29
+ mamba_d_state: 128
30
+ mamba_head_dim: 64
31
+ mamba_expand: 2
32
+ mamba_conv_kernel: 4
33
+ mamba_n_groups: 1
34
+ mamba_chunk_size: 256
35
+
36
+ train:
37
+ max_steps: 57000
38
+ batch_size: 4
39
+ grad_accum_steps: 8
40
+ lr: 2e-4
41
+ weight_decay: 0.1
42
+ warmup_steps: 2000
43
+ max_grad_norm: 1.0
44
+ log_interval: 10
45
+ save_interval: 2000
46
+ eval_interval: 500
47
+ use_amp: false
48
+ compile_model: false
49
+ fp8_amax_history_len: 16
50
+ fp8_amax_compute_algo: "max"
51
+ fp8_format: "MXFP8"
52
+
53
+ tokenizer:
54
+ vocab_size: 64000
55
+ type: sentencepiece_unigram
source/configs/korean_1b.yaml ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Korean LLM 1B parameters — BF16 기본 설정
2
+ # B200 × 8 GPU 최적화, GQA(4:1) + SwiGLU + RoPE(long-context)
3
+ #
4
+ # 아키텍처 계산:
5
+ # d_ffn = int(2/3 * 4 * 2048) = 5461 → 16배수 올림 = 5472 (FP8 alignment)
6
+ # 실제 파라미터 수 ≈ 12 * 24 * 2048^2 = 1,207,959,552 (~1.2B)
7
+ #
8
+ # 학습 설정:
9
+ # eff_batch = 4(bs) * 8(GPU) * 8(accum) * 4096(seq) = 1,048,576 토큰/스텝
10
+ # 200,000 스텝 × 1M tok = 200B 토큰 처리
11
+ model:
12
+ vocab_size: 64000
13
+ d_model: 2048
14
+ n_layers: 24
15
+ n_heads: 16
16
+ n_kv_heads: 4 # GQA: 4 KV 그룹, 16 쿼리 헤드 (4:1 비율)
17
+ d_ffn: 5472 # SwiGLU: int(2/3 * 4 * 2048)=5461 → 16배수=5472
18
+ max_seq_len: 4096
19
+ rope_theta: 500000.0 # Llama-3 스타일 고주파 외삽 (장문 컨텍스트)
20
+ dropout: 0.0
21
+ bias: false
22
+ use_flash_attn: true
23
+ use_fp8: false # BF16 기본; FP8은 korean_1b_fp8.yaml 참조
24
+
25
+ train:
26
+ max_steps: 200000
27
+ batch_size: 4 # per GPU: 4 × 4096 = 16,384 토큰
28
+ grad_accum_steps: 8 # eff_batch: 4 × 8GPU × 8 × 4096 = 1,048,576 tok/step
29
+ lr: 2.0e-4
30
+ weight_decay: 0.1
31
+ warmup_steps: 4000
32
+ max_grad_norm: 1.0
33
+ log_interval: 10
34
+ save_interval: 1000
35
+ eval_interval: 500
36
+ use_amp: true # BF16 mixed precision
37
+ compile_model: false
38
+
39
+ tokenizer:
40
+ vocab_size: 64000
41
+ type: sentencepiece_unigram
source/configs/korean_1b_fp8.yaml ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Korean LLM 1B parameters — FP8 변형 (B200 TransformerEngine 네이티브)
2
+ #
3
+ # [최적화 근거 — 2026-02-25]
4
+ # - 데이터: korean_train.bin 8.91B tokens
5
+ # - max_steps 34000 = 4 에포크 (Muennighoff 2023: 4에포크 초과 시 val loss 상승)
6
+ # * 기존 200k steps = 23.5 에포크 → 오버피팅 위험, compute 낭비
7
+ # - lr=2e-4: GPT-3 1.3B 기준과 정확히 일치 (변경 없음)
8
+ # - eff_batch=1.05M: GPT-3 1.3B 기준과 일치 (변경 없음)
9
+ # - warmup 2000 = 34k의 5.9% (기존 4000 = 11.8%로 과도했음)
10
+ # - save/eval 간격 단축: 34k steps 기준 더 촘촘한 체크포인트 필요
11
+ # - compile_model: false (TE 2.10 graph break 위험, 안정성 우선)
12
+ #
13
+ # 실행: bash scripts/launch_korean_1b.sh
14
+ model:
15
+ vocab_size: 64000
16
+ d_model: 2048
17
+ n_layers: 24
18
+ n_heads: 16
19
+ n_kv_heads: 4 # GQA 4:1 (K/V 파라미터 75% 절감)
20
+ d_ffn: 5472 # 16배수 (FP8 alignment 충족)
21
+ max_seq_len: 4096
22
+ rope_theta: 500000.0
23
+ dropout: 0.0
24
+ bias: false
25
+ use_flash_attn: true
26
+ use_fp8: true # TransformerEngine MXFP8BlockScaling (B200 네이티브)
27
+
28
+ train:
29
+ # 34k steps × 1,048,576 tok/step = 35.6B tokens = 4 에포크 (8.91B 데이터 기준)
30
+ max_steps: 34000
31
+ batch_size: 8 # per GPU: 8 × 4096 = 32,768 토큰 | VRAM 30.8% 사용 (192GB)
32
+ grad_accum_steps: 4 # eff_batch: 8 × 8GPU × 4 × 4096 = 1,048,576 tok/step
33
+ lr: 2.0e-4 # GPT-3 1.3B 기준 최적값과 정확히 일치
34
+ weight_decay: 0.1
35
+ warmup_steps: 2000 # 34k steps의 5.9% — 기존 4000은 11.8%로 과도
36
+ max_grad_norm: 1.0
37
+ log_interval: 10
38
+ save_interval: 500 # 34k steps 기준 ~70 체크포인트 (기존 1000은 너무 듬성)
39
+ eval_interval: 200 # val loss 조기 이상 감지용
40
+ use_amp: false # fp8_autocast가 대체 (torch.autocast 불필요)
41
+ compile_model: false # TE 2.10 + DDP graph break 위험
42
+ fp8_amax_history_len: 16
43
+ fp8_amax_compute_algo: "max"
44
+ fp8_format: "MXFP8" # B200 Blackwell 네이티브 블록 스케일링
45
+
46
+ tokenizer:
47
+ vocab_size: 64000
48
+ type: sentencepiece_unigram
source/configs/korean_1b_sft.yaml ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Korean LLM 1B — SFT (Supervised Fine-Tuning) 설정
2
+ #
3
+ # Base model: korean_1b_fp8_run1/checkpoint-0034000 (1.19B params, 34k pretrain steps)
4
+ # SFT 목표: instruction following + 반복 퇴화 완화 + 생성 품질 향상
5
+ #
6
+ # 실행: bash scripts/launch_sft.sh
7
+
8
+ model:
9
+ vocab_size: 64000
10
+ d_model: 2048
11
+ n_layers: 24
12
+ n_heads: 16
13
+ n_kv_heads: 4
14
+ d_ffn: 5472
15
+ max_seq_len: 4096
16
+ rope_theta: 500000.0
17
+ dropout: 0.0
18
+ bias: false
19
+ use_flash_attn: true
20
+ use_fp8: true
21
+
22
+ train:
23
+ max_steps: 5000 # SFT: 수천 steps면 충분 (pretrain 34k 대비 ~10%)
24
+ batch_size: 4 # per GPU (SFT는 seq가 다양하므로 작게)
25
+ grad_accum_steps: 2 # eff_batch: 4 × 8GPU × 2 × 4096 = 262,144 tok/step
26
+ lr: 2.0e-5 # pretrain의 1/10 (catastrophic forgetting 방지)
27
+ weight_decay: 0.01 # pretrain 0.1보다 약하게
28
+ warmup_steps: 150 # 5000 steps의 3.0%
29
+ max_grad_norm: 1.0
30
+ log_interval: 10
31
+ save_interval: 500
32
+ eval_interval: 100
33
+ use_amp: false # FP8 사용 시 불필요
34
+ compile_model: false
35
+ fp8_amax_history_len: 16
36
+ fp8_amax_compute_algo: "max"
37
+ fp8_format: "MXFP8"
38
+
39
+ tokenizer:
40
+ vocab_size: 64000
41
+ type: sentencepiece_unigram
source/configs/korean_3b_fp8.yaml ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Korean LLM 3B parameters — FP8 (B200 TransformerEngine MXFP8)
2
+ #
3
+ # [설계 근거 — 2026-02-27]
4
+ # - 아키텍처: LLaMA-3 3B 참고 (d=3072, 28L, 24H, GQA 3:1 — 24H:8KV)
5
+ # - 파라미터: ~3.0B (embedding 포함)
6
+ # - 데이터: korean_train.bin 8.91B tokens → 최소 60B tokens (7 에포크)
7
+ # - Chinchilla optimal: 3B 모델 → 60B tokens, 실용적으로 100B 권장
8
+ # - lr=1.5e-4: LLaMA-3 3B 기준 (1B의 2e-4 대비 낮춤, μP scaling ~1/sqrt(3))
9
+ # - eff_batch=2M tokens: 3B 기준 GPT-3 scaling law 참고
10
+ # - 체크포인트: ~27GB/개, 2000 step 간격 → 최대 ~30개 = 810GB
11
+ # - 예상 학습 시간: 8×B200 FP8 기준 ~72-96시간 (60B tokens)
12
+ #
13
+ # 실행: bash scripts/launch_3b_pretrain.sh
14
+ model:
15
+ vocab_size: 64000
16
+ d_model: 3072
17
+ n_layers: 28
18
+ n_heads: 24
19
+ n_kv_heads: 8 # GQA 3:1 (메모리 효율 + 품질 밸런스)
20
+ d_ffn: 8192 # ~2.67× d_model, 128배수 (FP8 alignment)
21
+ max_seq_len: 4096
22
+ rope_theta: 500000.0
23
+ dropout: 0.0
24
+ bias: false
25
+ use_flash_attn: true
26
+ use_fp8: true
27
+
28
+ train:
29
+ # Phase 1: ~74.7B tokens (최소) = 57000 steps × 1,310,720 tok/step
30
+ # Phase 2: 100B tokens (권장) ≈ 76300 steps (1,310,720 tok/step 기준)
31
+ max_steps: 57000
32
+ batch_size: 5 # per GPU: 5 × 4096 = 20,480 토큰 (QKV fusion 후 ~161GB/183GB VRAM, 21GB 여유)
33
+ grad_accum_steps: 8 # eff_batch: 5 × 8GPU × 8 × 4096 = 1,310,720 tok/step (~1.3M)
34
+ lr: 1.5e-4 # LLaMA-3 3B 스케일, Chinchilla 참고
35
+ weight_decay: 0.1
36
+ warmup_steps: 2000 # 57k의 3.5%
37
+ max_grad_norm: 1.0
38
+ log_interval: 10
39
+ save_interval: 2000 # 27GB/체크포인트 → 2000 step 간격 = ~28개 = 756GB
40
+ eval_interval: 500
41
+ use_amp: false
42
+ compile_model: false
43
+ fp8_amax_history_len: 16 # NOTE: MXFP8 format에서는 무시됨 (DelayedScaling 전용)
44
+ fp8_amax_compute_algo: "max" # NOTE: MXFP8 format에서는 무시됨
45
+ fp8_format: "MXFP8"
46
+
47
+ tokenizer:
48
+ vocab_size: 64000
49
+ type: sentencepiece_unigram
source/configs/korean_3b_orpo.yaml ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Korean 3B ORPO Configuration (Phase 3)
2
+ #
3
+ # Base model: SFT v1 best checkpoint (HF format)
4
+ # 목표: Greedy 반복률 73%→30% 이하, EOS 종료율 0%→80%+
5
+ #
6
+ # 실행:
7
+ # bash scripts/launch_3b_orpo.sh # 본 학습
8
+ # bash scripts/launch_3b_orpo.sh --max_steps 200 # 퀵 테스트
9
+ #
10
+ # [설계 근거]
11
+ # - beta=0.25: 반복률 73%는 극단적 → 강한 OR loss 필요 (기존 0.1에서 상향)
12
+ # - lr=1.2e-5: HP sweep 6-config 결과 최적 (eval_loss 1.862, margin +0.009)
13
+ # - epochs=2: 683K 규모에 3 epoch은 과적합 위험
14
+ # - max_length=1536: P95=880 tokens, 99%+ 커버 + VRAM 25% 절약
15
+
16
+ # Model
17
+ model_path: eval/outputs/hf_3b_sft_best
18
+ output_dir: checkpoints/korean_3b_orpo
19
+
20
+ # Training
21
+ epochs: 2
22
+ lr: 1.2e-5
23
+ beta: 0.25
24
+ batch_size: 4
25
+ gradient_accumulation_steps: 4
26
+ max_length: 1536
27
+ bf16: true
28
+ weight_decay: 0.01
29
+ seed: 42
30
+
31
+ # Scheduler
32
+ lr_scheduler_type: cosine
33
+ warmup_ratio: 0.05
34
+
35
+ # Evaluation & Early Stopping
36
+ eval_split_ratio: 0.05
37
+ eval_steps: 500
38
+ early_stopping_patience: 3
39
+
40
+ # Logging & checkpointing
41
+ logging_steps: 10
42
+ save_steps: 500
43
+ save_total_limit: 5
44
+ gradient_checkpointing: true
45
+ report_to: none
46
+
47
+ # Data
48
+ custom_data_path: data/preference/combined_preference.jsonl
49
+ dataset_num_proc: 64
source/configs/korean_3b_sft.yaml ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Korean 3B SFT Configuration
2
+ #
3
+ # Base model: checkpoints/korean_3b_fp8_run1/checkpoint-XXXXXX (3B params pretrained)
4
+ # SFT 목표: instruction following + 반복 퇴화 완화 + 생성 품질 향상
5
+ # 아키텍처: LLaMA-3 3B 참고 (d=3072, 28L, 24H, GQA 3:1 — 24H:8KV)
6
+ #
7
+ # 실행: bash scripts/launch_3b_sft.sh
8
+ #
9
+ # [설계 근거 — 2026-03-02]
10
+ # - batch: 2 × 8GPU × 4 grad_accum = 64 eff_batch
11
+ # - max_steps 33000 ≈ 3 epochs × 700K samples / 64 eff_batch
12
+ # - lr=1e-5: pretrain 1.5e-4의 1/15 (catastrophic forgetting 방지)
13
+ # - NEFTune alpha=5.0: 생성 다양성 향상, 반복 퇴화 완화
14
+ # - use_fp8=true: B200 MXFP8 네이티브 가속 유지
15
+
16
+ model:
17
+ vocab_size: 64000
18
+ d_model: 3072
19
+ n_layers: 28
20
+ n_heads: 24
21
+ n_kv_heads: 8
22
+ d_ffn: 8192
23
+ max_seq_len: 4096
24
+ rope_theta: 500000.0
25
+ dropout: 0.0
26
+ bias: false
27
+ use_flash_attn: true
28
+ use_fp8: true
29
+
30
+ train:
31
+ max_steps: 33000 # 3 epochs × 700K / 64 eff_batch
32
+ batch_size: 2 # per GPU (3B VRAM 절약)
33
+ grad_accum_steps: 4 # eff_batch: 2 × 8GPU × 4 = 64
34
+ lr: 1.0e-5 # catastrophic forgetting 방지
35
+ weight_decay: 0.01
36
+ warmup_steps: 500
37
+ max_grad_norm: 1.0
38
+ log_interval: 10
39
+ save_interval: 2000
40
+ eval_interval: 500
41
+ use_amp: false
42
+ compile_model: false
43
+ neftune_alpha: 5.0 # NEFTune noise injection
44
+
45
+ tokenizer:
46
+ vocab_size: 64000
47
+ type: sentencepiece_unigram
source/configs/korean_3b_sft_v2.yaml ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Korean 3B SFT v2 Configuration
2
+ #
3
+ # Base model: checkpoints/korean_3b_fp8_run1/checkpoint-0057000 (3B params pretrained)
4
+ # SFT v2 목표: v1의 underfitting 해결 + forgetting 방지 (data mixing)
5
+ # 아키텍처: LLaMA-3 3B 참고 (d=3072, 28L, 24H, GQA 3:1 — 24H:8KV)
6
+ #
7
+ # 실행: bash scripts/launch_3b_sft_v2.sh
8
+ #
9
+ # [설계 근거 — SFT v1 실패 분석 2026-03-06]
10
+ # v1 문제: lr=1e-5 → val_loss 변화 0 (사실상 학습 안 됨)
11
+ # v2 변경:
12
+ # - lr: 1e-5 → 5e-5 (5배 ↑, 3B SFT 표준 범위)
13
+ # - batch: 4 × 8GPU × 8 grad_accum = 256 eff_batch (v1 대비 4배 ↑)
14
+ # - warmup: 500 → 2000 (높은 LR에 맞춰 안정화)
15
+ # - max_steps: 33000 → 15000 (수렴 빨라짐, 과적합 방지)
16
+ # - weight_decay: 0.01 → 0.05 (forgetting 억제)
17
+ # - data mixing: SFT 70% + pretrain 30% (forgetting 방지)
18
+
19
+ model:
20
+ vocab_size: 64000
21
+ d_model: 3072
22
+ n_layers: 28
23
+ n_heads: 24
24
+ n_kv_heads: 8
25
+ d_ffn: 8192
26
+ max_seq_len: 4096
27
+ rope_theta: 500000.0
28
+ dropout: 0.0
29
+ bias: false
30
+ use_flash_attn: true
31
+ use_fp8: true
32
+
33
+ train:
34
+ max_steps: 15000 # v1 33000 → 15000 (수렴 빨라짐)
35
+ batch_size: 4 # v1 2 → 4 (VRAM 여유 충분: 48/183GB)
36
+ grad_accum_steps: 8 # v1 4 → 8 (eff_batch: 4 × 8GPU × 8 = 256)
37
+ lr: 5.0e-5 # v1 1e-5 → 5e-5 (5배 ↑, underfitting 해결)
38
+ weight_decay: 0.05 # v1 0.01 → 0.05 (forgetting 억제)
39
+ warmup_steps: 2000 # v1 500 → 2000 (높은 LR 안정화)
40
+ max_grad_norm: 1.0 # gradient clipping
41
+ log_interval: 10
42
+ save_interval: 2000
43
+ eval_interval: 500
44
+ use_amp: false
45
+ compile_model: false
46
+ neftune_alpha: 5.0 # NEFTune noise injection (유지)
47
+
48
+ # Data mixing (forgetting 방지)
49
+ pretrain_mix_ratio: 0.3 # pretrain 데이터 30% 혼합
50
+ pretrain_data: data/3b_train.bin # pretrain 데이터 경로
51
+
52
+ tokenizer:
53
+ vocab_size: 64000
54
+ type: sentencepiece_unigram
source/configs/medium.yaml ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Medium LLM ~350M parameters (GPT-2 medium equivalent)
2
+ model:
3
+ vocab_size: 32000
4
+ d_model: 1024
5
+ n_layers: 24
6
+ n_heads: 16
7
+ n_kv_heads: 8 # GQA: 2 query heads share each KV head (16H:8KV)
8
+ max_seq_len: 4096
9
+ rope_theta: 500000.0 # extended RoPE for longer context
10
+ dropout: 0.0
11
+ bias: false
12
+ use_flash_attn: true
13
+
14
+ train:
15
+ max_steps: 200000
16
+ batch_size: 4
17
+ grad_accum_steps: 8 # effective batch = 4 * 8 GPUs * 8 = 256
18
+ lr: 2.0e-4
19
+ weight_decay: 0.1
20
+ warmup_steps: 4000
21
+ max_grad_norm: 1.0
22
+ log_interval: 10
23
+ save_interval: 1000
24
+ eval_interval: 500
25
+ use_amp: true
26
+ compile_model: false
27
+
28
+ tokenizer:
29
+ vocab_size: 32000
30
+ type: bpe
source/configs/small.yaml ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Small LLM ~125M parameters (GPT-2 small equivalent)
2
+ model:
3
+ vocab_size: 32000
4
+ d_model: 768
5
+ n_layers: 12
6
+ n_heads: 12
7
+ n_kv_heads: 12 # MHA (same as n_heads)
8
+ max_seq_len: 2048
9
+ rope_theta: 10000.0
10
+ dropout: 0.0
11
+ bias: false
12
+ use_flash_attn: true
13
+
14
+ train:
15
+ max_steps: 100000
16
+ batch_size: 8 # per GPU
17
+ grad_accum_steps: 4 # effective batch = 8 * 8 GPUs * 4 = 256
18
+ lr: 3.0e-4
19
+ weight_decay: 0.1
20
+ warmup_steps: 2000
21
+ max_grad_norm: 1.0
22
+ log_interval: 10
23
+ save_interval: 1000
24
+ eval_interval: 500
25
+ use_amp: true
26
+ compile_model: false
27
+
28
+ tokenizer:
29
+ vocab_size: 32000
30
+ type: bpe
source/configs/small_fp8.yaml ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Small LLM ~125M parameters — FP8 variant (B200 TransformerEngine)
2
+ # Based on small.yaml; only changed fields are listed explicitly.
3
+ model:
4
+ vocab_size: 32000
5
+ d_model: 768
6
+ n_layers: 12
7
+ n_heads: 12
8
+ n_kv_heads: 12 # MHA (same as n_heads)
9
+ max_seq_len: 2048
10
+ rope_theta: 10000.0
11
+ dropout: 0.0
12
+ bias: false
13
+ use_flash_attn: true
14
+ use_fp8: true # Enable TransformerEngine FP8 kernels
15
+
16
+ train:
17
+ max_steps: 100000
18
+ batch_size: 8 # per GPU; 8 * 2048 = 16384 tokens → divisible by 8 ✓
19
+ grad_accum_steps: 4 # effective batch = 8 * 8 GPUs * 4 = 256
20
+ lr: 3.0e-4
21
+ weight_decay: 0.1
22
+ warmup_steps: 2000
23
+ max_grad_norm: 1.0
24
+ log_interval: 10
25
+ save_interval: 1000
26
+ eval_interval: 500
27
+ use_amp: false # fp8_autocast replaces torch.autocast
28
+ compile_model: false # torch.compile + TE 2.10 stability not verified
29
+ fp8_amax_history_len: 16
30
+ fp8_amax_compute_algo: "max"
31
+ fp8_format: "MXFP8" # B200 native block scaling (better than HYBRID on Blackwell)
32
+
33
+ tokenizer:
34
+ vocab_size: 32000
35
+ type: bpe