pathcosmos somebody-to-love committed on
Commit
09ea133
·
1 Parent(s): c0f89d0

Upload folder using huggingface_hub (#16)

Browse files

- Upload folder using huggingface_hub (da19444cc2b94051b3ee46aa2cc8793a2ea50b72)


Co-authored-by: pathcosmos <somebody-to-love@users.noreply.huggingface.co>

source/configs/3b_pretrain.yaml ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Korean LLM 3B parameters — FP8 (B200 TransformerEngine MXFP8)
2
+ #
3
+ # [아키텍처 근거 — 2026-02-27]
4
+ # - 저스티스리그 제안 기반: d_model=2560, 32L, 32H, 8KV
5
+ # - 파라미터: ~2.39B ("3B급" — Llama-3.2-3B 대비 경량, 한국어 64K vocab 효율)
6
+ # - d_ffn=6912: 2.7×d_model, 16배수 FP8 정렬
7
+ # - GQA 4:1 (32H:8KV) — 추론 효율 + KV cache 절약
8
+ # - head_dim=80 (2560/32) — Flash Attention 효율적
9
+ #
10
+ # [데이터/학습 설계]
11
+ # - 데이터: korean_train.bin 8.91B tokens
12
+ # - Chinchilla 최적: 2.4B × 20 = 48B tokens
13
+ # - 실제 목표: 60B tokens (6.7 에포크) — 한국어 단일 언어 특성상 추가 학습 유리
14
+ # - max_steps 57000 = 60B tokens / 1,048,576 tok/step
15
+ #
16
+ # [GPU 메모리 예측 — 8× B200 183GB]
17
+ # - 모델 FP8: 2.4 GB
18
+ # - Optimizer (bf16 master + fp32 mom/var): 23.9 GB
19
+ # - Gradient (bf16): 4.8 GB
20
+ # - Activation (per GPU, bs=8): ~27 GB
21
+ # - 합계: ~58 GB/GPU (31.7% 활용) → 여유 충분
22
+ #
23
+ # 실행: bash scripts/launch_korean_3b.sh
24
+ # 테스트: RUN_NAME=korean_3b_test bash scripts/launch_korean_3b.sh --max_steps 50
25
+
26
+ model:
27
+ vocab_size: 64000
28
+ d_model: 2560
29
+ n_layers: 32
30
+ n_heads: 32
31
+ n_kv_heads: 8 # GQA 4:1 (K/V 파라미터 75% 절감)
32
+ d_ffn: 6912 # 2.7×d_model, 16배수 (FP8 alignment)
33
+ max_seq_len: 4096
34
+ rope_theta: 500000.0
35
+ dropout: 0.0
36
+ bias: false
37
+ use_flash_attn: true
38
+ use_fp8: true # TransformerEngine MXFP8BlockScaling (B200 네이티브)
39
+
40
+ train:
41
+ # 57k steps × 1,048,576 tok/step = 59.8B tokens ≈ 6.7 에포크
42
+ max_steps: 57000
43
+ batch_size: 4 # per GPU: 4 × 4096 = 16,384 토큰 | VRAM ~130 GB (183GB의 71%)
44
+ grad_accum_steps: 8 # eff_batch: 4 × 8GPU × 8 × 4096 = 1,048,576 tok/step
45
+ lr: 1.5e-4 # 3B 규모: GPT-3 scaling 기준 1B(2e-4) → 3B(1.5e-4)
46
+ weight_decay: 0.1
47
+ warmup_steps: 2000 # 57k steps의 3.5% — 안정적 warmup
48
+ max_grad_norm: 1.0
49
+ log_interval: 10
50
+ save_interval: 1000 # 57k steps 기준 ~57 체크포인트
51
+ eval_interval: 500 # val loss 모니터링
52
+ use_amp: false # fp8_autocast가 대체
53
+ compile_model: false # TE 2.10 + DDP graph break 위험
54
+ fp8_amax_history_len: 16
55
+ fp8_amax_compute_algo: "max"
56
+ fp8_format: "MXFP8" # B200 Blackwell 네이티브 블록 스케일링
57
+
58
+ tokenizer:
59
+ vocab_size: 64000
60
+ type: sentencepiece_unigram
source/configs/clickhouse-config.xml ADDED
@@ -0,0 +1,315 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0"?>
2
+ <!--
3
+ ClickHouse Server Configuration
4
+ =================================
5
+ Hardware: AMD EPYC 9365 36-Core (72 threads), 2.2 TB RAM, 2x NUMA nodes
6
+ Purpose : Data analytics for LLM training pipeline + factory sensor data (CRM project)
7
+ Generated: 2026-03-01
8
+ -->
9
+ <clickhouse>
10
+
11
+ <!-- =========================================================
12
+ Network / Listen
13
+ Listen on localhost only — GPU training node is local only
14
+ ========================================================= -->
15
+ <listen_host>127.0.0.1</listen_host>
16
+ <http_port>8123</http_port>
17
+ <tcp_port>9000</tcp_port>
18
+ <interserver_http_port>9009</interserver_http_port>
19
+
20
+ <!-- =========================================================
21
+ Paths
22
+ Data on GPFS (20 TB, 18 TB free) for large datasets.
23
+ Tmp / logs on local /tmp to reduce GPFS small-file pressure.
24
+ ========================================================= -->
25
+ <path>/PROJECT/0325120031_A/ghong/taketimes/clickhouse-data/</path>
26
+ <tmp_path>/tmp/clickhouse-tmp/</tmp_path>
27
+
28
+ <!-- =========================================================
29
+ Logging
30
+ ========================================================= -->
31
+ <logger>
32
+ <level>information</level>
33
+ <log>/tmp/clickhouse/logs/clickhouse-server.log</log>
34
+ <errorlog>/tmp/clickhouse/logs/clickhouse-server.err.log</errorlog>
35
+ <!-- Rotate at 512 MB, keep 10 files -->
36
+ <size>536870912</size>
37
+ <count>10</count>
38
+ </logger>
39
+
40
+ <!-- =========================================================
41
+ Memory — server-level cap for all queries combined
42
+ ========================================================= -->
43
+ <max_server_memory_usage>536870912000</max_server_memory_usage>
44
+
45
+ <!-- =========================================================
46
+ CPU / Thread Pools (server-level settings)
47
+ Physical cores: 36 per socket, 72 total (2 NUMA nodes).
48
+ Reserve half for GPU training → 36 threads for ClickHouse.
49
+ ========================================================= -->
50
+ <background_pool_size>18</background_pool_size>
51
+ <background_merges_mutations_concurrency_ratio>2</background_merges_mutations_concurrency_ratio>
52
+ <background_move_pool_size>4</background_move_pool_size>
53
+ <background_fetches_pool_size>4</background_fetches_pool_size>
54
+ <background_schedule_pool_size>8</background_schedule_pool_size>
55
+ <background_common_pool_size>8</background_common_pool_size>
56
+
57
+ <!-- =========================================================
58
+ Concurrency
59
+ ========================================================= -->
60
+ <max_concurrent_queries>100</max_concurrent_queries>
61
+ <max_waiting_queries>50</max_waiting_queries>
62
+
63
+ <!-- =========================================================
64
+ Caches
65
+ mark_cache : 10 GB — indexes for MergeTree parts
66
+ uncompressed : 20 GB — decompressed block cache
67
+ query_cache : 2 GB — optional query result cache
68
+ NUMA note: ClickHouse allocates via jemalloc with NUMA
69
+ awareness; no extra config needed beyond thread binding.
70
+ ========================================================= -->
71
+ <mark_cache_size>10737418240</mark_cache_size>
72
+ <uncompressed_cache_size>21474836480</uncompressed_cache_size>
73
+ <!-- Available from ClickHouse 23.x -->
74
+ <query_cache>
75
+ <max_size_in_bytes>2147483648</max_size_in_bytes>
76
+ <max_entries>1024</max_entries>
77
+ <max_entry_size_in_bytes>104857600</max_entry_size_in_bytes>
78
+ <max_entry_size_in_rows>30000000</max_entry_size_in_rows>
79
+ </query_cache>
80
+ <!-- Compiled expression cache -->
81
+ <compiled_expression_cache_size>134217728</compiled_expression_cache_size>
82
+ <compiled_expression_cache_elements_size>10000</compiled_expression_cache_elements_size>
83
+
84
+ <!-- =========================================================
85
+ I/O
86
+ GPFS is a parallel filesystem — large sequential reads are
87
+ efficient; use aggressive read-ahead and prefetch.
88
+ NVMe local disks can be used for tmp / intermediate data.
89
+ ========================================================= -->
90
+ <!-- MergeTree concurrent read settings moved to <profiles><default> -->
91
+ <!-- Async reads from object/POSIX storage -->
92
+ <asynchronous_metrics_update_period_s>60</asynchronous_metrics_update_period_s>
93
+
94
+ <!-- Async Insert settings moved to <profiles><default> below -->
95
+
96
+ <!-- =========================================================
97
+ MergeTree Storage Settings
98
+ ========================================================= -->
99
+ <merge_tree>
100
+ <!-- Bytes: prefer larger parts on GPFS to reduce metadata overhead -->
101
+ <max_bytes_to_merge_at_max_space_in_pool>161061273600</max_bytes_to_merge_at_max_space_in_pool>
102
+ <!-- Allow up to 300 parts per partition before slowing inserts -->
103
+ <parts_to_throw_insert>300</parts_to_throw_insert>
104
+ <parts_to_delay_insert>150</parts_to_delay_insert>
105
+ <!-- Use AVX-512 SIMD for sorting / hashing where available -->
106
+ <use_minimalistic_part_header_in_zookeeper>1</use_minimalistic_part_header_in_zookeeper>
107
+ <!-- Keep deleted data up to 8 hours before final cleanup -->
108
+ <old_parts_lifetime>28800</old_parts_lifetime>
109
+ </merge_tree>
110
+
111
+ <!-- =========================================================
112
+ Compression
113
+ LZ4 default (fast, AVX2/AVX-512 accelerated).
114
+ ZSTD level 3 for cold / archival tables — trade CPU for space.
115
+ ========================================================= -->
116
+ <compression>
117
+ <!-- Hot data: LZ4 -->
118
+ <case>
119
+ <min_part_size>1073741824</min_part_size>
120
+ <min_part_size_ratio>0.01</min_part_size_ratio>
121
+ <method>lz4</method>
122
+ </case>
123
+ <!-- Very large parts: ZSTD for better ratio -->
124
+ <case>
125
+ <min_part_size>10737418240</min_part_size>
126
+ <min_part_size_ratio>0.1</min_part_size_ratio>
127
+ <method>zstd</method>
128
+ <level>3</level>
129
+ </case>
130
+ </compression>
131
+
132
+ <!-- =========================================================
133
+ Users / Access Control
134
+ Single local user, no password (localhost-only listen).
135
+ See users.xml (or inline below) for quota/profile.
136
+ ========================================================= -->
137
+ <users>
138
+ <default>
139
+ <password></password>
140
+ <networks>
141
+ <ip>127.0.0.1/8</ip>
142
+ <ip>::1</ip>
143
+ </networks>
144
+ <profile>default</profile>
145
+ <quota>default</quota>
146
+ <!-- Allow DDL from default user -->
147
+ <access_management>1</access_management>
148
+ </default>
149
+ </users>
150
+
151
+ <profiles>
152
+ <default>
153
+ <!-- Memory per query: 500 GB -->
154
+ <max_memory_usage>536870912000</max_memory_usage>
155
+ <max_bytes_before_external_group_by>483183820800</max_bytes_before_external_group_by>
156
+ <max_bytes_before_external_sort>483183820800</max_bytes_before_external_sort>
157
+ <!-- Threads per query: half of 72 cores -->
158
+ <max_threads>36</max_threads>
159
+ <max_concurrent_queries_for_user>100</max_concurrent_queries_for_user>
160
+ <use_uncompressed_cache>1</use_uncompressed_cache>
161
+ <!-- O_DIRECT for large scans -->
162
+ <min_bytes_to_use_direct_io>10737418240</min_bytes_to_use_direct_io>
163
+ <!-- SIMD JSON parsing (AVX-512) -->
164
+ <input_format_parallel_parsing>1</input_format_parallel_parsing>
165
+ <output_format_parallel_formatting>1</output_format_parallel_formatting>
166
+ <!-- Async insert for sensor/CRM streaming -->
167
+ <async_insert>1</async_insert>
168
+ <async_insert_max_data_size>33554432</async_insert_max_data_size>
169
+ <async_insert_busy_timeout_ms>200</async_insert_busy_timeout_ms>
170
+ <async_insert_deduplicate>0</async_insert_deduplicate>
171
+ <wait_for_async_insert>1</wait_for_async_insert>
172
+ <wait_for_async_insert_timeout>5</wait_for_async_insert_timeout>
173
+ <!-- MergeTree concurrent read -->
174
+ <merge_tree_min_rows_for_concurrent_read>20000</merge_tree_min_rows_for_concurrent_read>
175
+ <merge_tree_min_bytes_for_concurrent_read>24117248</merge_tree_min_bytes_for_concurrent_read>
176
+ </default>
177
+ </profiles>
178
+
179
+ <quotas>
180
+ <default>
181
+ <interval>
182
+ <duration>3600</duration>
183
+ <queries>0</queries>
184
+ <errors>0</errors>
185
+ <result_rows>0</result_rows>
186
+ <read_rows>0</read_rows>
187
+ <execution_time>0</execution_time>
188
+ </interval>
189
+ </default>
190
+ </quotas>
191
+
192
+ <!-- =========================================================
193
+ Distributed DDL (single-node — disable ZooKeeper dependency)
194
+ ========================================================= -->
195
+ <!-- No ZooKeeper configured; replicated tables use ReplicatedMergeTree
196
+ only if ZK is added later. Commenting out to avoid startup warnings.
197
+ <zookeeper>
198
+ <node>
199
+ <host>localhost</host>
200
+ <port>2181</port>
201
+ </node>
202
+ </zookeeper>
203
+ -->
204
+
205
+ <!-- =========================================================
206
+ Timezone
207
+ ========================================================= -->
208
+ <timezone>Asia/Seoul</timezone>
209
+
210
+ <!-- =========================================================
211
+ Query Log / System Tables
212
+ Keep 30 days of query history for pipeline debugging.
213
+ ========================================================= -->
214
+ <query_log>
215
+ <database>system</database>
216
+ <table>query_log</table>
217
+ <partition_by>toYYYYMM(event_date)</partition_by>
218
+ <ttl>event_date + INTERVAL 30 DAY</ttl>
219
+ <flush_interval_milliseconds>7500</flush_interval_milliseconds>
220
+ <max_size_rows>1048576</max_size_rows>
221
+ </query_log>
222
+
223
+ <query_thread_log>
224
+ <database>system</database>
225
+ <table>query_thread_log</table>
226
+ <partition_by>toYYYYMM(event_date)</partition_by>
227
+ <ttl>event_date + INTERVAL 7 DAY</ttl>
228
+ <flush_interval_milliseconds>7500</flush_interval_milliseconds>
229
+ </query_thread_log>
230
+
231
+ <part_log>
232
+ <database>system</database>
233
+ <table>part_log</table>
234
+ <partition_by>toYYYYMM(event_date)</partition_by>
235
+ <ttl>event_date + INTERVAL 14 DAY</ttl>
236
+ <flush_interval_milliseconds>5000</flush_interval_milliseconds>
237
+ </part_log>
238
+
239
+ <trace_log>
240
+ <database>system</database>
241
+ <table>trace_log</table>
242
+ <partition_by>toYYYYMM(event_date)</partition_by>
243
+ <ttl>event_date + INTERVAL 7 DAY</ttl>
244
+ <flush_interval_milliseconds>7500</flush_interval_milliseconds>
245
+ </trace_log>
246
+
247
+ <metric_log>
248
+ <database>system</database>
249
+ <table>metric_log</table>
250
+ <flush_interval_milliseconds>7500</flush_interval_milliseconds>
251
+ <collect_interval_milliseconds>1000</collect_interval_milliseconds>
252
+ <ttl>event_date + INTERVAL 7 DAY</ttl>
253
+ </metric_log>
254
+
255
+ <asynchronous_metric_log>
256
+ <database>system</database>
257
+ <table>asynchronous_metric_log</table>
258
+ <flush_interval_milliseconds>7500</flush_interval_milliseconds>
259
+ <ttl>event_date + INTERVAL 7 DAY</ttl>
260
+ </asynchronous_metric_log>
261
+
262
+ <!-- =========================================================
263
+ Crash Handler
264
+ ========================================================= -->
265
+ <core_dump>
266
+ <size_limit>0</size_limit>
267
+ </core_dump>
268
+
269
+ <!-- =========================================================
270
+ Keeper (built-in, single-node mode — replaces ZooKeeper
271
+ if you want ReplicatedMergeTree without external ZK).
272
+ Uncomment if needed.
273
+ ========================================================= -->
274
+ <!--
275
+ <keeper_server>
276
+ <tcp_port>9181</tcp_port>
277
+ <server_id>1</server_id>
278
+ <log_storage_path>/PROJECT/0325120031_A/ghong/taketimes/clickhouse-data/keeper/logs</log_storage_path>
279
+ <snapshot_storage_path>/PROJECT/0325120031_A/ghong/taketimes/clickhouse-data/keeper/snapshots</snapshot_storage_path>
280
+ <coordination_settings>
281
+ <operation_timeout_ms>10000</operation_timeout_ms>
282
+ <session_timeout_ms>30000</session_timeout_ms>
283
+ <raft_logs_level>warning</raft_logs_level>
284
+ </coordination_settings>
285
+ <raft_configuration>
286
+ <server>
287
+ <id>1</id>
288
+ <hostname>localhost</hostname>
289
+ <port>9444</port>
290
+ </server>
291
+ </raft_configuration>
292
+ </keeper_server>
293
+ -->
294
+
295
+ <!-- =========================================================
296
+ AVX-512 / SIMD hints
297
+ ClickHouse auto-detects CPUID at runtime; these flags are
298
+ informational comments — no XML knobs needed.
299
+ Detected: avx512f, avx512bw, avx512vl, avx512_vnni, avx512_bf16
300
+ Used in: LZ4 compression, hash aggregation, sorting, filters.
301
+ ========================================================= -->
302
+
303
+ <!-- =========================================================
304
+ Miscellaneous
305
+ ========================================================= -->
306
+ <!-- Skip strict settings check for forward-compat -->
307
+ <skip_check_for_incorrect_settings>1</skip_check_for_incorrect_settings>
308
+ <!-- Graceful shutdown: wait up to 60 s for running queries -->
309
+ <shutdown_wait_unfinished>60</shutdown_wait_unfinished>
310
+ <!-- Send anonymous usage statistics: off for private server -->
311
+ <send_crash_reports>
312
+ <enabled>false</enabled>
313
+ </send_crash_reports>
314
+
315
+ </clickhouse>
source/configs/hybrid_3b.yaml ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # FRANKENSTALLM-H 3B: Hybrid Mamba-2 + Transformer
2
+ #
3
+ # [설계 근거 — 2026-03-05]
4
+ # - 아키텍처: Nemotron-H 8B Dense 참고, 3B 스케일 적용
5
+ # - 40 layers: 37 Mamba-2 + 3 Attention (layer 13, 26, 39)
6
+ # - 파라미터: ~2.9B (embedding 포함)
7
+ # - 데이터: 3b_train.bin (기존 Pure Transformer 동일 데이터)
8
+ # - lr=2e-4: Mamba-2 논문 참고, Transformer보다 약간 높음
9
+ # - Attention 3개: 초반(13), 중반(26), 후반(39) 균등 배치
10
+ # - Mamba 장점: O(n) 시퀀스 처리, 추론 시 constant memory
11
+ #
12
+ # 실행: bash scripts/launch_hybrid_3b.sh
13
+ model:
14
+ vocab_size: 64000
15
+ d_model: 3072
16
+ n_layers: 40
17
+ n_heads: 24
18
+ n_kv_heads: 8
19
+ d_ffn: 9216
20
+ max_seq_len: 4096
21
+ rope_theta: 500000.0
22
+ dropout: 0.0
23
+ bias: false
24
+ use_flash_attn: true
25
+ use_fp8: true
26
+ # Hybrid settings
27
+ use_hybrid: true
28
+ hybrid_pattern: "M M M M M M M M M M M M M A M M M M M M M M M M M M A M M M M M M M M M M M M A"
29
+ mamba_d_state: 128
30
+ mamba_head_dim: 64
31
+ mamba_expand: 2
32
+ mamba_conv_kernel: 4
33
+ mamba_n_groups: 1
34
+ mamba_chunk_size: 256
35
+
36
+ train:
37
+ max_steps: 57000
38
+ batch_size: 4
39
+ grad_accum_steps: 8
40
+ lr: 2e-4
41
+ weight_decay: 0.1
42
+ warmup_steps: 2000
43
+ max_grad_norm: 1.0
44
+ log_interval: 10
45
+ save_interval: 2000
46
+ eval_interval: 500
47
+ use_amp: false
48
+ compile_model: false
49
+ fp8_amax_history_len: 16
50
+ fp8_amax_compute_algo: "max"
51
+ fp8_format: "MXFP8"
52
+
53
+ tokenizer:
54
+ vocab_size: 64000
55
+ type: sentencepiece_unigram
source/configs/korean_1b.yaml ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Korean LLM 1B parameters — BF16 기본 설정
2
+ # B200 × 8 GPU 최적화, GQA(4:1) + SwiGLU + RoPE(long-context)
3
+ #
4
+ # 아키텍처 계산:
5
+ # d_ffn = int(2/3 * 4 * 2048) = 5461 → 16배수 올림 = 5472 (FP8 alignment)
6
+ # 실제 파라미터 수 ≈ 12 * 24 * 2048^2 = 1,207,959,552 (~1.2B)
7
+ #
8
+ # 학습 설정:
9
+ # eff_batch = 4(bs) * 8(GPU) * 8(accum) * 4096(seq) = 1,048,576 토큰/스텝
10
+ # 200,000 스텝 × 1M tok = 200B 토큰 처리
11
+ model:
12
+ vocab_size: 64000
13
+ d_model: 2048
14
+ n_layers: 24
15
+ n_heads: 16
16
+ n_kv_heads: 4 # GQA: 4 KV 그룹, 16 쿼리 헤드 (4:1 비율)
17
+ d_ffn: 5472 # SwiGLU: int(2/3 * 4 * 2048)=5461 → 16배수=5472
18
+ max_seq_len: 4096
19
+ rope_theta: 500000.0 # Llama-3 스타일 고주파 외삽 (장문 컨텍스트)
20
+ dropout: 0.0
21
+ bias: false
22
+ use_flash_attn: true
23
+ use_fp8: false # BF16 기본; FP8은 korean_1b_fp8.yaml 참조
24
+
25
+ train:
26
+ max_steps: 200000
27
+ batch_size: 4 # per GPU: 4 × 4096 = 16,384 토큰
28
+ grad_accum_steps: 8 # eff_batch: 4 × 8GPU × 8 × 4096 = 1,048,576 tok/step
29
+ lr: 2.0e-4
30
+ weight_decay: 0.1
31
+ warmup_steps: 4000
32
+ max_grad_norm: 1.0
33
+ log_interval: 10
34
+ save_interval: 1000
35
+ eval_interval: 500
36
+ use_amp: true # BF16 mixed precision
37
+ compile_model: false
38
+
39
+ tokenizer:
40
+ vocab_size: 64000
41
+ type: sentencepiece_unigram
source/configs/korean_1b_fp8.yaml ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Korean LLM 1B parameters — FP8 변형 (B200 TransformerEngine 네이티브)
2
+ #
3
+ # [최적화 근거 — 2026-02-25]
4
+ # - 데이터: korean_train.bin 8.91B tokens
5
+ # - max_steps 34000 = 4 에포크 (Muennighoff 2023: 4에포크 초과 시 val loss 상승)
6
+ # * 기존 200k steps = 23.5 에포크 → 오버피팅 위험, compute 낭비
7
+ # - lr=2e-4: GPT-3 1.3B 기준과 정확히 일치 (변경 없음)
8
+ # - eff_batch=1.05M: GPT-3 1.3B 기준과 일치 (변경 없음)
9
+ # - warmup 2000 = 34k의 5.9% (기존 4000 = 11.8%로 과도했음)
10
+ # - save/eval 간격 단축: 34k steps 기준 더 촘촘한 체크포인트 필요
11
+ # - compile_model: false (TE 2.10 graph break 위험, 안정성 우선)
12
+ #
13
+ # 실행: bash scripts/launch_korean_1b.sh
14
+ model:
15
+ vocab_size: 64000
16
+ d_model: 2048
17
+ n_layers: 24
18
+ n_heads: 16
19
+ n_kv_heads: 4 # GQA 4:1 (K/V 파라미터 75% 절감)
20
+ d_ffn: 5472 # 16배수 (FP8 alignment 충족)
21
+ max_seq_len: 4096
22
+ rope_theta: 500000.0
23
+ dropout: 0.0
24
+ bias: false
25
+ use_flash_attn: true
26
+ use_fp8: true # TransformerEngine MXFP8BlockScaling (B200 네이티브)
27
+
28
+ train:
29
+ # 34k steps × 1,048,576 tok/step = 35.6B tokens = 4 에포크 (8.91B 데이터 기준)
30
+ max_steps: 34000
31
+ batch_size: 8 # per GPU: 8 × 4096 = 32,768 토큰 | VRAM 30.8% 사용 (192GB)
32
+ grad_accum_steps: 4 # eff_batch: 8 × 8GPU × 4 × 4096 = 1,048,576 tok/step
33
+ lr: 2.0e-4 # GPT-3 1.3B 기준 최적값과 정확히 일치
34
+ weight_decay: 0.1
35
+ warmup_steps: 2000 # 34k steps의 5.9% — 기존 4000은 11.8%로 과도
36
+ max_grad_norm: 1.0
37
+ log_interval: 10
38
+ save_interval: 500 # 34k steps 기준 ~70 체크포인트 (기존 1000은 너무 듬성)
39
+ eval_interval: 200 # val loss 조기 이상 감지용
40
+ use_amp: false # fp8_autocast가 대체 (torch.autocast 불필요)
41
+ compile_model: false # TE 2.10 + DDP graph break 위험
42
+ fp8_amax_history_len: 16
43
+ fp8_amax_compute_algo: "max"
44
+ fp8_format: "MXFP8" # B200 Blackwell 네이티브 블록 스케일링
45
+
46
+ tokenizer:
47
+ vocab_size: 64000
48
+ type: sentencepiece_unigram
source/configs/korean_1b_sft.yaml ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Korean LLM 1B — SFT (Supervised Fine-Tuning) 설정
2
+ #
3
+ # Base model: korean_1b_fp8_run1/checkpoint-0034000 (1.19B params, 34k pretrain steps)
4
+ # SFT 목표: instruction following + 반복 퇴화 완화 + 생성 품질 향상
5
+ #
6
+ # 실행: bash scripts/launch_sft.sh
7
+
8
+ model:
9
+ vocab_size: 64000
10
+ d_model: 2048
11
+ n_layers: 24
12
+ n_heads: 16
13
+ n_kv_heads: 4
14
+ d_ffn: 5472
15
+ max_seq_len: 4096
16
+ rope_theta: 500000.0
17
+ dropout: 0.0
18
+ bias: false
19
+ use_flash_attn: true
20
+ use_fp8: true
21
+
22
+ train:
23
+ max_steps: 5000 # SFT: 수천 steps면 충분 (pretrain 34k 대비 ~10%)
24
+ batch_size: 4 # per GPU (SFT는 seq가 다양하므로 작게)
25
+ grad_accum_steps: 2 # eff_batch: 4 × 8GPU × 2 × 4096 = 262,144 tok/step
26
+ lr: 2.0e-5 # pretrain의 1/10 (catastrophic forgetting 방지)
27
+ weight_decay: 0.01 # pretrain 0.1보다 약하게
28
+ warmup_steps: 150 # 5000 steps의 3.0%
29
+ max_grad_norm: 1.0
30
+ log_interval: 10
31
+ save_interval: 500
32
+ eval_interval: 100
33
+ use_amp: false # FP8 사용 시 불필요
34
+ compile_model: false
35
+ fp8_amax_history_len: 16
36
+ fp8_amax_compute_algo: "max"
37
+ fp8_format: "MXFP8"
38
+
39
+ tokenizer:
40
+ vocab_size: 64000
41
+ type: sentencepiece_unigram
source/configs/korean_3b_fp8.yaml ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Korean LLM 3B parameters — FP8 (B200 TransformerEngine MXFP8)
2
+ #
3
+ # [설계 근거 — 2026-02-27]
4
+ # - 아키텍처: LLaMA-3 3B 참고 (d=3072, 28L, 24H, GQA 3:1 — 24H:8KV)
5
+ # - 파라미터: ~3.0B (embedding 포함)
6
+ # - 데이터: korean_train.bin 8.91B tokens → 최소 60B tokens (7 에포크)
7
+ # - Chinchilla optimal: 3B 모델 → 60B tokens, 실용적으로 100B 권장
8
+ # - lr=1.5e-4: LLaMA-3 3B 기준 (1B의 2e-4 대비 낮춤, μP scaling ~1/sqrt(3))
9
+ # - eff_batch=2M tokens: 3B 기준 GPT-3 scaling law 참고
10
+ # - 체크포인트: ~27GB/개, 2000 step 간격 → 최대 ~30개 = 810GB
11
+ # - 예상 학습 시간: 8×B200 FP8 기준 ~72-96시간 (60B tokens)
12
+ #
13
+ # 실행: bash scripts/launch_3b_pretrain.sh
14
+ model:
15
+ vocab_size: 64000
16
+ d_model: 3072
17
+ n_layers: 28
18
+ n_heads: 24
19
+ n_kv_heads: 8 # GQA 3:1 (메모리 효율 + 품질 밸런스)
20
+ d_ffn: 8192 # ~2.67× d_model, 128배수 (FP8 alignment)
21
+ max_seq_len: 4096
22
+ rope_theta: 500000.0
23
+ dropout: 0.0
24
+ bias: false
25
+ use_flash_attn: true
26
+ use_fp8: true
27
+
28
+ train:
29
+ # Phase 1: ~74.7B tokens (최소) = 57000 steps × 1,310,720 tok/step
30
+ # Phase 2: 100B tokens (권장) ≈ 76300 steps (1,310,720 tok/step 기준)
31
+ max_steps: 57000
32
+ batch_size: 5 # per GPU: 5 × 4096 = 20,480 토큰 (QKV fusion 후 ~161GB/183GB VRAM, 21GB 여유)
33
+ grad_accum_steps: 8 # eff_batch: 5 × 8GPU × 8 × 4096 = 1,310,720 tok/step (~1.3M)
34
+ lr: 1.5e-4 # LLaMA-3 3B 스케일, Chinchilla 참고
35
+ weight_decay: 0.1
36
+ warmup_steps: 2000 # 57k의 3.5%
37
+ max_grad_norm: 1.0
38
+ log_interval: 10
39
+ save_interval: 2000 # 27GB/체크포인트 → 2000 step 간격 = ~28개 = 756GB
40
+ eval_interval: 500
41
+ use_amp: false
42
+ compile_model: false
43
+ fp8_amax_history_len: 16 # NOTE: MXFP8 format에서는 무시됨 (DelayedScaling 전용)
44
+ fp8_amax_compute_algo: "max" # NOTE: MXFP8 format에서는 무시됨
45
+ fp8_format: "MXFP8"
46
+
47
+ tokenizer:
48
+ vocab_size: 64000
49
+ type: sentencepiece_unigram
source/configs/korean_3b_orpo.yaml ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Korean 3B ORPO Configuration (Phase 3)
2
+ #
3
+ # Base model: SFT v1 best checkpoint (HF format)
4
+ # 목표: Greedy 반복률 73%→30% 이하, EOS 종료율 0%→80%+
5
+ #
6
+ # 실행:
7
+ # bash scripts/launch_3b_orpo.sh # 본 학습
8
+ # bash scripts/launch_3b_orpo.sh --max_steps 200 # 퀵 테스트
9
+ #
10
+ # [설계 근거]
11
+ # - beta=0.25: 반복률 73%는 극단적 → 강한 OR loss 필요 (기존 0.1에서 상향)
12
+ # - lr=1.2e-5: HP sweep 6-config 결과 최적 (eval_loss 1.862, margin +0.009)
13
+ # - epochs=2: 683K 규모에 3 epoch은 과적합 위험
14
+ # - max_length=1536: P95=880 tokens, 99%+ 커버 + VRAM 25% 절약
15
+
16
+ # Model
17
+ model_path: eval/outputs/hf_3b_sft_best
18
+ output_dir: checkpoints/korean_3b_orpo
19
+
20
+ # Training
21
+ epochs: 2
22
+ lr: 1.2e-5
23
+ beta: 0.25
24
+ batch_size: 4
25
+ gradient_accumulation_steps: 4
26
+ max_length: 1536
27
+ bf16: true
28
+ weight_decay: 0.01
29
+ seed: 42
30
+
31
+ # Scheduler
32
+ lr_scheduler_type: cosine
33
+ warmup_ratio: 0.05
34
+
35
+ # Evaluation & Early Stopping
36
+ eval_split_ratio: 0.05
37
+ eval_steps: 500
38
+ early_stopping_patience: 3
39
+
40
+ # Logging & checkpointing
41
+ logging_steps: 10
42
+ save_steps: 500
43
+ save_total_limit: 5
44
+ gradient_checkpointing: true
45
+ report_to: none
46
+
47
+ # Data
48
+ custom_data_path: data/preference/combined_preference.jsonl
49
+ dataset_num_proc: 64
source/configs/korean_3b_sft.yaml ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Korean 3B SFT Configuration
2
+ #
3
+ # Base model: checkpoints/korean_3b_fp8_run1/checkpoint-XXXXXX (3B params pretrained)
4
+ # SFT 목표: instruction following + 반복 퇴화 완화 + 생성 품질 향상
5
+ # 아키텍처: LLaMA-3 3B 참고 (d=3072, 28L, 24H, GQA 3:1 — 24H:8KV)
6
+ #
7
+ # 실행: bash scripts/launch_3b_sft.sh
8
+ #
9
+ # [설계 근거 — 2026-03-02]
10
+ # - batch: 2 × 8GPU × 4 grad_accum = 64 eff_batch
11
+ # - max_steps 33000 ≈ 3 epochs × 700K samples / 64 eff_batch
12
+ # - lr=1e-5: pretrain 1.5e-4의 1/15 (catastrophic forgetting 방지)
13
+ # - NEFTune alpha=5.0: 생성 다양성 향상, 반복 퇴화 완화
14
+ # - use_fp8=true: B200 MXFP8 네이티브 가속 유지
15
+
16
+ model:
17
+ vocab_size: 64000
18
+ d_model: 3072
19
+ n_layers: 28
20
+ n_heads: 24
21
+ n_kv_heads: 8
22
+ d_ffn: 8192
23
+ max_seq_len: 4096
24
+ rope_theta: 500000.0
25
+ dropout: 0.0
26
+ bias: false
27
+ use_flash_attn: true
28
+ use_fp8: true
29
+
30
+ train:
31
+ max_steps: 33000 # 3 epochs × 700K / 64 eff_batch
32
+ batch_size: 2 # per GPU (3B VRAM 절약)
33
+ grad_accum_steps: 4 # eff_batch: 2 × 8GPU × 4 = 64
34
+ lr: 1.0e-5 # catastrophic forgetting 방지
35
+ weight_decay: 0.01
36
+ warmup_steps: 500
37
+ max_grad_norm: 1.0
38
+ log_interval: 10
39
+ save_interval: 2000
40
+ eval_interval: 500
41
+ use_amp: false
42
+ compile_model: false
43
+ neftune_alpha: 5.0 # NEFTune noise injection
44
+
45
+ tokenizer:
46
+ vocab_size: 64000
47
+ type: sentencepiece_unigram
source/configs/korean_3b_sft_v2.yaml ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Korean 3B SFT v2 Configuration
2
+ #
3
+ # Base model: checkpoints/korean_3b_fp8_run1/checkpoint-0057000 (3B params pretrained)
4
+ # SFT v2 목표: v1의 underfitting 해결 + forgetting 방지 (data mixing)
5
+ # 아키텍처: LLaMA-3 3B 참고 (d=3072, 28L, 24H, GQA 3:1 — 24H:8KV)
6
+ #
7
+ # 실행: bash scripts/launch_3b_sft_v2.sh
8
+ #
9
+ # [설계 근거 — SFT v1 실패 분석 2026-03-06]
10
+ # v1 문제: lr=1e-5 → val_loss 변화 0 (사실상 학습 안 됨)
11
+ # v2 변경:
12
+ # - lr: 1e-5 → 5e-5 (5배 ↑, 3B SFT 표준 범위)
13
+ # - batch: 4 × 8GPU × 8 grad_accum = 256 eff_batch (v1 대비 4배 ↑)
14
+ # - warmup: 500 → 2000 (높은 LR에 맞춰 안정화)
15
+ # - max_steps: 33000 → 15000 (수렴 빨라짐, 과적합 방지)
16
+ # - weight_decay: 0.01 → 0.05 (forgetting 억제)
17
+ # - data mixing: SFT 70% + pretrain 30% (forgetting 방지)
18
+
19
+ model:
20
+ vocab_size: 64000
21
+ d_model: 3072
22
+ n_layers: 28
23
+ n_heads: 24
24
+ n_kv_heads: 8
25
+ d_ffn: 8192
26
+ max_seq_len: 4096
27
+ rope_theta: 500000.0
28
+ dropout: 0.0
29
+ bias: false
30
+ use_flash_attn: true
31
+ use_fp8: true
32
+
33
+ train:
34
+ max_steps: 15000 # v1 33000 → 15000 (수렴 빨라짐)
35
+ batch_size: 4 # v1 2 → 4 (VRAM 여유 충분: 48/183GB)
36
+ grad_accum_steps: 8 # v1 4 → 8 (eff_batch: 4 × 8GPU × 8 = 256)
37
+ lr: 5.0e-5 # v1 1e-5 → 5e-5 (5배 ↑, underfitting 해결)
38
+ weight_decay: 0.05 # v1 0.01 → 0.05 (forgetting 억제)
39
+ warmup_steps: 2000 # v1 500 → 2000 (높은 LR 안정화)
40
+ max_grad_norm: 1.0 # gradient clipping
41
+ log_interval: 10
42
+ save_interval: 2000
43
+ eval_interval: 500
44
+ use_amp: false
45
+ compile_model: false
46
+ neftune_alpha: 5.0 # NEFTune noise injection (유지)
47
+
48
+ # Data mixing (forgetting 방지)
49
+ pretrain_mix_ratio: 0.3 # pretrain 데이터 30% 혼합
50
+ pretrain_data: data/3b_train.bin # pretrain 데이터 경로
51
+
52
+ tokenizer:
53
+ vocab_size: 64000
54
+ type: sentencepiece_unigram
source/configs/medium.yaml ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Medium LLM ~350M parameters (GPT-2 medium equivalent)
2
+ model:
3
+ vocab_size: 32000
4
+ d_model: 1024
5
+ n_layers: 24
6
+ n_heads: 16
7
+ n_kv_heads: 8 # GQA: 2 query heads share each KV head (16H:8KV)
8
+ max_seq_len: 4096
9
+ rope_theta: 500000.0 # extended RoPE for longer context
10
+ dropout: 0.0
11
+ bias: false
12
+ use_flash_attn: true
13
+
14
+ train:
15
+ max_steps: 200000
16
+ batch_size: 4
17
+ grad_accum_steps: 8 # effective batch = 4 * 8 GPUs * 8 = 256
18
+ lr: 2.0e-4
19
+ weight_decay: 0.1
20
+ warmup_steps: 4000
21
+ max_grad_norm: 1.0
22
+ log_interval: 10
23
+ save_interval: 1000
24
+ eval_interval: 500
25
+ use_amp: true
26
+ compile_model: false
27
+
28
+ tokenizer:
29
+ vocab_size: 32000
30
+ type: bpe
source/configs/small.yaml ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Small LLM ~125M parameters (GPT-2 small equivalent)
2
+ model:
3
+ vocab_size: 32000
4
+ d_model: 768
5
+ n_layers: 12
6
+ n_heads: 12
7
+ n_kv_heads: 12 # MHA (same as n_heads)
8
+ max_seq_len: 2048
9
+ rope_theta: 10000.0
10
+ dropout: 0.0
11
+ bias: false
12
+ use_flash_attn: true
13
+
14
+ train:
15
+ max_steps: 100000
16
+ batch_size: 8 # per GPU
17
+ grad_accum_steps: 4 # effective batch = 8 * 8 GPUs * 4 = 256
18
+ lr: 3.0e-4
19
+ weight_decay: 0.1
20
+ warmup_steps: 2000
21
+ max_grad_norm: 1.0
22
+ log_interval: 10
23
+ save_interval: 1000
24
+ eval_interval: 500
25
+ use_amp: true
26
+ compile_model: false
27
+
28
+ tokenizer:
29
+ vocab_size: 32000
30
+ type: bpe
source/configs/small_fp8.yaml ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Small LLM ~125M parameters — FP8 variant (B200 TransformerEngine)
2
+ # Based on small.yaml; only changed fields are listed explicitly.
3
+ model:
4
+ vocab_size: 32000
5
+ d_model: 768
6
+ n_layers: 12
7
+ n_heads: 12
8
+ n_kv_heads: 12 # MHA (same as n_heads)
9
+ max_seq_len: 2048
10
+ rope_theta: 10000.0
11
+ dropout: 0.0
12
+ bias: false
13
+ use_flash_attn: true
14
+ use_fp8: true # Enable TransformerEngine FP8 kernels
15
+
16
+ train:
17
+ max_steps: 100000
18
+ batch_size: 8 # per GPU; 8 * 2048 = 16384 tokens → divisible by 8 ✓
19
+ grad_accum_steps: 4 # effective batch = 8 * 8 GPUs * 4 = 256
20
+ lr: 3.0e-4
21
+ weight_decay: 0.1
22
+ warmup_steps: 2000
23
+ max_grad_norm: 1.0
24
+ log_interval: 10
25
+ save_interval: 1000
26
+ eval_interval: 500
27
+ use_amp: false # fp8_autocast replaces torch.autocast
28
+ compile_model: false # torch.compile + TE 2.10 stability not verified
29
+ fp8_amax_history_len: 16
30
+ fp8_amax_compute_algo: "max"
31
+ fp8_format: "MXFP8" # B200 native block scaling (better than HYBRID on Blackwell)
32
+
33
+ tokenizer:
34
+ vocab_size: 32000
35
+ type: bpe