docs: translate all Korean comments and docstrings to English
Convert all Korean-language comments, docstrings, and inline annotations
to English across 38 source files and CLAUDE.md. Update code conventions
to require English for all code, comments, docstrings, and git commit messages.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
- CLAUDE.md +56 -56
- llm_lab/__init__.py +9 -9
- llm_lab/config/__init__.py +1 -1
- llm_lab/config/data_config.py +21 -21
- llm_lab/config/eval_config.py +5 -5
- llm_lab/config/model_config.py +19 -19
- llm_lab/config/train_config.py +49 -49
- llm_lab/data/__init__.py +1 -1
- llm_lab/data/dataset.py +60 -59
- llm_lab/data/diagnostics.py +42 -42
- llm_lab/data/pipeline.py +35 -35
- llm_lab/data/tokenizer.py +53 -53
- llm_lab/evaluation/__init__.py +1 -1
- llm_lab/evaluation/attention_viz.py +23 -23
- llm_lab/evaluation/checklist.py +36 -36
- llm_lab/evaluation/dynamics.py +38 -38
- llm_lab/evaluation/full_evaluator.py +48 -48
- llm_lab/evaluation/generation.py +49 -49
- llm_lab/evaluation/perplexity.py +34 -34
- llm_lab/evaluation/runner.py +5 -5
- llm_lab/evaluation/scaling.py +28 -28
- llm_lab/model/__init__.py +1 -1
- llm_lab/model/attention.py +27 -27
- llm_lab/model/feedforward.py +17 -17
- llm_lab/model/llm_model.py +49 -49
- llm_lab/model/norm.py +15 -15
- llm_lab/model/rope.py +31 -29
- llm_lab/model/transformer_block.py +12 -12
- llm_lab/model/utils.py +11 -11
- llm_lab/training/__init__.py +1 -1
- llm_lab/training/checkpoint.py +40 -40
- llm_lab/training/metrics.py +19 -19
- llm_lab/training/optimizer.py +15 -15
- llm_lab/training/runner.py +14 -14
- llm_lab/training/scheduler.py +18 -18
- llm_lab/training/trainer.py +68 -67
- llm_lab/utils/__init__.py +1 -1
- llm_lab/utils/device.py +20 -20
- llm_lab/utils/seed.py +2 -2
CLAUDE.md
CHANGED
|
@@ -1,78 +1,78 @@
|
|
| 1 |
# LLM-1B-Lab
|
| 2 |
|
| 3 |
-
1.1B parameter LLaMA-style Decoder-Only Transformer
|
| 4 |
-
|
| 5 |
|
| 6 |
-
##
|
| 7 |
|
| 8 |
```
|
| 9 |
LLM_Foundation_Model/
|
| 10 |
├── CLAUDE.md
|
| 11 |
├── requirements.txt
|
| 12 |
-
├── llm_lab/ # Python
|
| 13 |
│ ├── __init__.py
|
| 14 |
-
│ ├── config/ #
|
| 15 |
-
│ │ ├── model_config.py # ModelConfig (debug_10m / small_100m / base_1b
|
| 16 |
-
│ │ ├── data_config.py # DataConfig (
|
| 17 |
-
│ │ ├── train_config.py # TrainConfig (LR,
|
| 18 |
-
│ │ └── eval_config.py # EvalConfig (
|
| 19 |
-
│ ├── model/ #
|
| 20 |
│ │ ├── norm.py # RMSNorm
|
| 21 |
│ │ ├── rope.py # RotaryPositionalEmbedding (RoPE)
|
| 22 |
│ │ ├── attention.py # GroupedQueryAttention (GQA)
|
| 23 |
│ │ ├── feedforward.py # SwiGLUFeedForward
|
| 24 |
│ │ ├── transformer_block.py # TransformerBlock (Pre-LN)
|
| 25 |
-
│ │ ├── llm_model.py # LLMModel (
|
| 26 |
│ │ └── utils.py # count_parameters_detailed, estimate_memory_gb
|
| 27 |
-
│ ├── data/ #
|
| 28 |
│ │ ├── tokenizer.py # Tokenizer (SentencePiece / BPE / HuggingFace)
|
| 29 |
│ │ ├── dataset.py # PackedStreamingDataset, ValidationDataset, _collate_fn
|
| 30 |
│ │ ├── pipeline.py # create_train_dataloader, setup_data_pipeline
|
| 31 |
│ │ └── diagnostics.py # DataPipelineDiagnostics
|
| 32 |
-
│ ├── training/ #
|
| 33 |
│ │ ├── scheduler.py # CosineWarmupScheduler
|
| 34 |
-
│ │ ├── checkpoint.py # CheckpointManager (Google Drive
|
| 35 |
-
│ │ ├── metrics.py # MetricsTracker (wandb
|
| 36 |
-
│ │ ├── optimizer.py # create_optimizer (weight decay
|
| 37 |
│ │ ├── trainer.py # Trainer (gradient accumulation, mixed precision)
|
| 38 |
-
│ │ └── runner.py # start_training (
|
| 39 |
-
│ ├── evaluation/ #
|
| 40 |
-
│ │ ├── perplexity.py # PerplexityEvaluator (
|
| 41 |
-
│ │ ├── generation.py # GenerationEvaluator (
|
| 42 |
│ │ ├── scaling.py # ScalingAnalyzer (Chinchilla Scaling Law)
|
| 43 |
-
│ │ ├── dynamics.py # TrainingDynamicsAnalyzer (Loss/LR/Grad
|
| 44 |
-
│ │ ├── attention_viz.py # AttentionVisualizer (
|
| 45 |
-
│ │ ├── full_evaluator.py # FullEvaluator (
|
| 46 |
-
│ │ ├── checklist.py # InsightChecklist (
|
| 47 |
-
│ │ └── runner.py # run_evaluation (
|
| 48 |
-
│ └── utils/ #
|
| 49 |
│ ├── device.py # auto_configure, get_device, detect_gpu_info
|
| 50 |
│ └── seed.py # set_seed
|
| 51 |
-
├── notebooks/ # Jupyter
|
| 52 |
│ ├── 01_data_pipeline.ipynb
|
| 53 |
│ ├── 02_model.ipynb
|
| 54 |
│ ├── 03_training.ipynb
|
| 55 |
│ └── 04_evaluation.ipynb
|
| 56 |
-
└── _archive/ #
|
| 57 |
├── llm-1b-model.py
|
| 58 |
├── llm-1b-data-pipeline.py
|
| 59 |
├── llm-1b-trainer.py
|
| 60 |
└── llm-1b-evaluation.py
|
| 61 |
```
|
| 62 |
|
| 63 |
-
##
|
| 64 |
|
| 65 |
-
- **
|
| 66 |
-
- **
|
| 67 |
-
- **
|
| 68 |
-
- **
|
| 69 |
-
- **
|
| 70 |
-
- **
|
| 71 |
|
| 72 |
-
##
|
| 73 |
|
| 74 |
```
|
| 75 |
-
config (
|
| 76 |
↓
|
| 77 |
utils → config
|
| 78 |
↓
|
|
@@ -85,13 +85,13 @@ training → config, utils
|
|
| 85 |
evaluation → config
|
| 86 |
```
|
| 87 |
|
| 88 |
-
##
|
| 89 |
|
| 90 |
-
|
|
| 91 |
-
|--------|---------|-----|--------|-------|----------|------|
|
| 92 |
-
| `debug_10m` | ~10M | 256 | 6 | 8 | 4 |
|
| 93 |
-
| `small_100m` | ~100M | 768 | 12 | 12 | 4 |
|
| 94 |
-
| `base_1b` | ~1.1B | 2048 | 22 | 32 | 8 |
|
| 95 |
|
| 96 |
## Quick Start
|
| 97 |
|
|
@@ -102,30 +102,30 @@ from llm_lab.data import setup_data_pipeline
|
|
| 102 |
from llm_lab.training import start_training
|
| 103 |
from llm_lab.evaluation import run_evaluation
|
| 104 |
|
| 105 |
-
# 1.
|
| 106 |
model = LLMModel(ModelConfig.base_1b())
|
| 107 |
|
| 108 |
-
# 2.
|
| 109 |
tok, train_dl, val_dl = setup_data_pipeline("pretrained")
|
| 110 |
|
| 111 |
-
# 3.
|
| 112 |
trainer = start_training(model, train_dl, val_dl)
|
| 113 |
|
| 114 |
-
# 4.
|
| 115 |
report = run_evaluation(model, tok, val_dl,
|
| 116 |
metrics_history=trainer.metrics.history)
|
| 117 |
```
|
| 118 |
|
| 119 |
-
##
|
| 120 |
|
| 121 |
-
- **
|
| 122 |
-
- **
|
| 123 |
-
- **
|
| 124 |
-
- **
|
| 125 |
-
- **
|
| 126 |
|
| 127 |
-
##
|
| 128 |
|
| 129 |
-
- `torch`
|
| 130 |
- `pip install torch datasets tokenizers sentencepiece transformers wandb matplotlib numpy`
|
| 131 |
-
-
|
|
|
|
| 1 |
# LLM-1B-Lab
|
| 2 |
|
| 3 |
+
Educational implementation of a 1.1B parameter LLaMA-style Decoder-Only Transformer.
|
| 4 |
+
Designed so beginners in deep learning can experience training and evaluating an LLM from scratch.
|
| 5 |
|
| 6 |
+
## Project Structure
|
| 7 |
|
| 8 |
```
|
| 9 |
LLM_Foundation_Model/
|
| 10 |
├── CLAUDE.md
|
| 11 |
├── requirements.txt
|
| 12 |
+
├── llm_lab/ # Python package (core code)
|
| 13 |
│ ├── __init__.py
|
| 14 |
+
│ ├── config/ # Configuration dataclasses
|
| 15 |
+
│ │ ├── model_config.py # ModelConfig (debug_10m / small_100m / base_1b presets)
|
| 16 |
+
│ │ ├── data_config.py # DataConfig (dataset, tokenizer, batch settings)
|
| 17 |
+
│ │ ├── train_config.py # TrainConfig (LR, scheduler, checkpoint, wandb)
|
| 18 |
+
│ │ └── eval_config.py # EvalConfig (evaluation parameters)
|
| 19 |
+
│ ├── model/ # Model architecture
|
| 20 |
│ │ ├── norm.py # RMSNorm
|
| 21 |
│ │ ├── rope.py # RotaryPositionalEmbedding (RoPE)
|
| 22 |
│ │ ├── attention.py # GroupedQueryAttention (GQA)
|
| 23 |
│ │ ├── feedforward.py # SwiGLUFeedForward
|
| 24 |
│ │ ├── transformer_block.py # TransformerBlock (Pre-LN)
|
| 25 |
+
│ │ ├── llm_model.py # LLMModel (full model + generate)
|
| 26 |
│ │ └── utils.py # count_parameters_detailed, estimate_memory_gb
|
| 27 |
+
│ ├── data/ # Data pipeline
|
| 28 |
│ │ ├── tokenizer.py # Tokenizer (SentencePiece / BPE / HuggingFace)
|
| 29 |
│ │ ├── dataset.py # PackedStreamingDataset, ValidationDataset, _collate_fn
|
| 30 |
│ │ ├── pipeline.py # create_train_dataloader, setup_data_pipeline
|
| 31 |
│ │ └── diagnostics.py # DataPipelineDiagnostics
|
| 32 |
+
│ ├── training/ # Training loop
|
| 33 |
│ │ ├── scheduler.py # CosineWarmupScheduler
|
| 34 |
+
│ │ ├── checkpoint.py # CheckpointManager (Google Drive support)
|
| 35 |
+
│ │ ├── metrics.py # MetricsTracker (wandb integration)
|
| 36 |
+
│ │ ├── optimizer.py # create_optimizer (weight decay separation)
|
| 37 |
│ │ ├── trainer.py # Trainer (gradient accumulation, mixed precision)
|
| 38 |
+
│ │ └── runner.py # start_training (one-line helper)
|
| 39 |
+
│ ├── evaluation/ # Evaluation & analysis
|
| 40 |
+
│ │ ├── perplexity.py # PerplexityEvaluator (including per-position loss)
|
| 41 |
+
│ │ ├── generation.py # GenerationEvaluator (various prompts)
|
| 42 |
│ │ ├── scaling.py # ScalingAnalyzer (Chinchilla Scaling Law)
|
| 43 |
+
│ │ ├── dynamics.py # TrainingDynamicsAnalyzer (Loss/LR/Grad visualization)
|
| 44 |
+
│ │ ├── attention_viz.py # AttentionVisualizer (per-head heatmap)
|
| 45 |
+
│ │ ├── full_evaluator.py # FullEvaluator (comprehensive evaluation + report)
|
| 46 |
+
│ │ ├── checklist.py # InsightChecklist (training insight checklist)
|
| 47 |
+
│ │ └── runner.py # run_evaluation (one-line helper)
|
| 48 |
+
│ └── utils/ # Common utilities
|
| 49 |
│ ├── device.py # auto_configure, get_device, detect_gpu_info
|
| 50 |
│ └── seed.py # set_seed
|
| 51 |
+
├── notebooks/ # Jupyter notebooks (configuration + execution)
|
| 52 |
│ ├── 01_data_pipeline.ipynb
|
| 53 |
│ ├── 02_model.ipynb
|
| 54 |
│ ├── 03_training.ipynb
|
| 55 |
│ └── 04_evaluation.ipynb
|
| 56 |
+
└── _archive/ # Original single-file backups
|
| 57 |
├── llm-1b-model.py
|
| 58 |
├── llm-1b-data-pipeline.py
|
| 59 |
├── llm-1b-trainer.py
|
| 60 |
└── llm-1b-evaluation.py
|
| 61 |
```
|
| 62 |
|
| 63 |
+
## Tech Stack
|
| 64 |
|
| 65 |
+
- **Model**: LLaMA-style Decoder-Only Transformer (RMSNorm, RoPE, GQA, SwiGLU, Weight Tying)
|
| 66 |
+
- **Training**: Gradient Accumulation, Mixed Precision (bf16/fp16), Cosine LR + Warmup, Activation Checkpointing
|
| 67 |
+
- **Data**: HuggingFace Streaming (FineWeb-Edu), BPE tokenizer, sequence packing
|
| 68 |
+
- **Checkpoint**: Auto save/restore to Google Drive (Colab Pro+ environment)
|
| 69 |
+
- **Evaluation**: Perplexity, text generation, Scaling Law, Attention visualization
|
| 70 |
+
- **Target Environment**: Google Colab Pro+ (A100 40GB)
|
| 71 |
|
| 72 |
+
## Dependency Graph (no cycles)
|
| 73 |
|
| 74 |
```
|
| 75 |
+
config (no dependencies)
|
| 76 |
↓
|
| 77 |
utils → config
|
| 78 |
↓
|
|
|
|
| 85 |
evaluation → config
|
| 86 |
```
|
| 87 |
|
| 88 |
+
## Model Presets
|
| 89 |
|
| 90 |
+
| Preset | Parameters | dim | layers | heads | kv_heads | Purpose |
|
| 91 |
+
|--------|-----------|-----|--------|-------|----------|---------|
|
| 92 |
+
| `debug_10m` | ~10M | 256 | 6 | 8 | 4 | Fast validation/debug |
|
| 93 |
+
| `small_100m` | ~100M | 768 | 12 | 12 | 4 | Intermediate experiments |
|
| 94 |
+
| `base_1b` | ~1.1B | 2048 | 22 | 16 | 4 | Full-scale training |
|
| 95 |
|
| 96 |
## Quick Start
|
| 97 |
|
|
|
|
| 102 |
from llm_lab.training import start_training
|
| 103 |
from llm_lab.evaluation import run_evaluation
|
| 104 |
|
| 105 |
+
# 1. Model
|
| 106 |
model = LLMModel(ModelConfig.base_1b())
|
| 107 |
|
| 108 |
+
# 2. Data
|
| 109 |
tok, train_dl, val_dl = setup_data_pipeline("pretrained")
|
| 110 |
|
| 111 |
+
# 3. Training
|
| 112 |
trainer = start_training(model, train_dl, val_dl)
|
| 113 |
|
| 114 |
+
# 4. Evaluation
|
| 115 |
report = run_evaluation(model, tok, val_dl,
|
| 116 |
metrics_history=trainer.metrics.history)
|
| 117 |
```
|
| 118 |
|
| 119 |
+
## Code Conventions
|
| 120 |
|
| 121 |
+
- **Language**: All code, comments, docstrings, and git commit messages must be written in English
|
| 122 |
+
- **Type hints**: Use typing annotations on all functions
|
| 123 |
+
- **Import order**: stdlib → torch → llm_lab (absolute) → local (relative)
|
| 124 |
+
- **Dataclasses**: All configurations defined as `@dataclass` with defaults
|
| 125 |
+
- **Error handling**: Optional dependencies (matplotlib, wandb, etc.) wrapped in `try/except ImportError`
|
| 126 |
|
| 127 |
+
## Notes
|
| 128 |
|
| 129 |
+
- `torch` may not be installed locally (assumes Colab Pro+ runtime)
|
| 130 |
- `pip install torch datasets tokenizers sentencepiece transformers wandb matplotlib numpy`
|
| 131 |
+
- The logic in the original 4 files (`_archive/`) and the modularized `llm_lab/` package is identical (only import paths changed)
|
llm_lab/__init__.py
CHANGED
|
@@ -1,16 +1,16 @@
|
|
| 1 |
"""
|
| 2 |
LLM-1B-Lab: 1B Parameter LLaMA-style Transformer (from scratch)
|
| 3 |
================================================================
|
| 4 |
-
|
| 5 |
-
|
| 6 |
|
| 7 |
-
|
| 8 |
-
llm_lab.config —
|
| 9 |
-
llm_lab.model —
|
| 10 |
-
llm_lab.data —
|
| 11 |
-
llm_lab.training —
|
| 12 |
-
llm_lab.evaluation —
|
| 13 |
-
llm_lab.utils —
|
| 14 |
|
| 15 |
Quick Start:
|
| 16 |
from llm_lab.config import ModelConfig, DataConfig, TrainConfig
|
|
|
|
| 1 |
"""
|
| 2 |
LLM-1B-Lab: 1B Parameter LLaMA-style Transformer (from scratch)
|
| 3 |
================================================================
|
| 4 |
+
An educational implementation for deep learning beginners.
|
| 5 |
+
Each component includes detailed comments explaining "why" things are done this way.
|
| 6 |
|
| 7 |
+
Module structure:
|
| 8 |
+
llm_lab.config — All configurations (ModelConfig, DataConfig, TrainConfig, EvalConfig)
|
| 9 |
+
llm_lab.model — Model architecture (RMSNorm, RoPE, GQA, SwiGLU, Transformer)
|
| 10 |
+
llm_lab.data — Data pipeline (tokenizer, streaming, packing)
|
| 11 |
+
llm_lab.training — Training loop (Trainer, scheduler, checkpoint)
|
| 12 |
+
llm_lab.evaluation — Evaluation (Perplexity, generation, Scaling Law, Attention)
|
| 13 |
+
llm_lab.utils — Common utilities (device detection, seed)
|
| 14 |
|
| 15 |
Quick Start:
|
| 16 |
from llm_lab.config import ModelConfig, DataConfig, TrainConfig
|
llm_lab/config/__init__.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
"""
|
| 2 |
from .model_config import ModelConfig
|
| 3 |
from .data_config import DataConfig
|
| 4 |
from .train_config import TrainConfig
|
|
|
|
| 1 |
+
"""Config module — manages all hyperparameters in one place."""
|
| 2 |
from .model_config import ModelConfig
|
| 3 |
from .data_config import DataConfig
|
| 4 |
from .train_config import TrainConfig
|
llm_lab/config/data_config.py
CHANGED
|
@@ -4,38 +4,38 @@ from typing import Optional
|
|
| 4 |
|
| 5 |
@dataclass
|
| 6 |
class DataConfig:
|
| 7 |
-
"""
|
| 8 |
|
| 9 |
-
Colab Pro+
|
| 10 |
-
- Streaming
|
| 11 |
-
-
|
| 12 |
-
-
|
| 13 |
"""
|
| 14 |
-
# ──
|
| 15 |
dataset_name: str = "HuggingFaceFW/fineweb-edu"
|
| 16 |
-
dataset_subset: str = "sample-10BT" # 10B
|
| 17 |
dataset_split: str = "train"
|
| 18 |
-
text_column: str = "text" #
|
| 19 |
|
| 20 |
-
# ──
|
| 21 |
-
tokenizer_type: str = "sentencepiece" # "sentencepiece"
|
| 22 |
-
#
|
| 23 |
tokenizer_path: Optional[str] = None
|
| 24 |
vocab_size: int = 32_000
|
| 25 |
|
| 26 |
-
# ──
|
| 27 |
max_seq_len: int = 2048
|
| 28 |
-
#
|
| 29 |
use_eos_separator: bool = True
|
| 30 |
|
| 31 |
-
# ──
|
| 32 |
-
batch_size: int = 4 # micro batch (GPU
|
| 33 |
-
num_workers: int = 2 #
|
| 34 |
-
prefetch_factor: int = 4 #
|
| 35 |
|
| 36 |
-
# ──
|
| 37 |
-
tokenizer_train_samples: int = 50_000 #
|
| 38 |
tokenizer_save_dir: str = "./tokenizer"
|
| 39 |
|
| 40 |
-
# ──
|
| 41 |
-
val_ratio: float = 0.001 #
|
|
|
|
| 4 |
|
| 5 |
@dataclass
|
| 6 |
class DataConfig:
|
| 7 |
+
"""Data pipeline configuration.
|
| 8 |
|
| 9 |
+
Default values optimized for Colab Pro+ environment constraints:
|
| 10 |
+
- Streaming mode to minimize disk usage
|
| 11 |
+
- Sequence packing to maximize GPU utilization without padding
|
| 12 |
+
- On-the-fly preprocessing to save memory
|
| 13 |
"""
|
| 14 |
+
# ── Dataset ──
|
| 15 |
dataset_name: str = "HuggingFaceFW/fineweb-edu"
|
| 16 |
+
dataset_subset: str = "sample-10BT" # 10B token sample
|
| 17 |
dataset_split: str = "train"
|
| 18 |
+
text_column: str = "text" # column name containing text
|
| 19 |
|
| 20 |
+
# ── Tokenizer ──
|
| 21 |
+
tokenizer_type: str = "sentencepiece" # "sentencepiece" or "hf"
|
| 22 |
+
# path to a pretrained tokenizer (trains a new one if not provided)
|
| 23 |
tokenizer_path: Optional[str] = None
|
| 24 |
vocab_size: int = 32_000
|
| 25 |
|
| 26 |
+
# ── Sequence ──
|
| 27 |
max_seq_len: int = 2048
|
| 28 |
+
# whether to use a document separator token (marks document boundaries during packing)
|
| 29 |
use_eos_separator: bool = True
|
| 30 |
|
| 31 |
+
# ── Batch ──
|
| 32 |
+
batch_size: int = 4 # micro batch (per GPU)
|
| 33 |
+
num_workers: int = 2 # number of DataLoader workers
|
| 34 |
+
prefetch_factor: int = 4 # number of batches to prefetch
|
| 35 |
|
| 36 |
+
# ── Tokenizer training settings (when training from scratch) ──
|
| 37 |
+
tokenizer_train_samples: int = 50_000 # number of documents to use for training
|
| 38 |
tokenizer_save_dir: str = "./tokenizer"
|
| 39 |
|
| 40 |
+
# ── Validation data ──
|
| 41 |
+
val_ratio: float = 0.001 # use 0.1% of total data for validation
|
llm_lab/config/eval_config.py
CHANGED
|
@@ -3,18 +3,18 @@ from dataclasses import dataclass
|
|
| 3 |
|
| 4 |
@dataclass
|
| 5 |
class EvalConfig:
|
| 6 |
-
"""
|
| 7 |
# ── Perplexity ──
|
| 8 |
eval_batch_size: int = 4
|
| 9 |
-
max_eval_batches: int = 100 #
|
| 10 |
|
| 11 |
-
# ──
|
| 12 |
max_new_tokens: int = 200
|
| 13 |
temperature: float = 0.8
|
| 14 |
top_k: int = 50
|
| 15 |
top_p: float = 0.9
|
| 16 |
-
num_samples: int = 3 #
|
| 17 |
|
| 18 |
-
# ──
|
| 19 |
save_dir: str = "./eval_results"
|
| 20 |
plot_dpi: int = 150
|
|
|
|
| 3 |
|
| 4 |
@dataclass
|
| 5 |
class EvalConfig:
|
| 6 |
+
"""Evaluation parameters."""
|
| 7 |
# ── Perplexity ──
|
| 8 |
eval_batch_size: int = 4
|
| 9 |
+
max_eval_batches: int = 100 # maximum number of evaluation batches
|
| 10 |
|
| 11 |
+
# ── Generation ──
|
| 12 |
max_new_tokens: int = 200
|
| 13 |
temperature: float = 0.8
|
| 14 |
top_k: int = 50
|
| 15 |
top_p: float = 0.9
|
| 16 |
+
num_samples: int = 3 # number of generations per prompt
|
| 17 |
|
| 18 |
+
# ── Output ──
|
| 19 |
save_dir: str = "./eval_results"
|
| 20 |
plot_dpi: int = 150
|
llm_lab/config/model_config.py
CHANGED
|
@@ -3,37 +3,37 @@ from dataclasses import dataclass
|
|
| 3 |
|
| 4 |
@dataclass
|
| 5 |
class ModelConfig:
|
| 6 |
-
"""
|
| 7 |
|
| 8 |
-
|
| 9 |
-
- debug: ~10M (
|
| 10 |
-
- small: ~100M (
|
| 11 |
-
- base: ~1.1B (
|
| 12 |
"""
|
| 13 |
vocab_size: int = 32_000
|
| 14 |
-
hidden_dim: int = 2048 # d_model:
|
| 15 |
-
num_layers: int = 22 #
|
| 16 |
-
num_heads: int = 16 #
|
| 17 |
-
num_kv_heads: int = 4 # Key/Value
|
| 18 |
-
intermediate_dim: int = 5632 # FFN
|
| 19 |
-
max_seq_len: int = 2048 #
|
| 20 |
-
dropout: float = 0.0 #
|
| 21 |
-
rope_theta: float = 10000.0 # RoPE
|
| 22 |
norm_eps: float = 1e-6 # RMSNorm epsilon
|
| 23 |
|
| 24 |
@property
|
| 25 |
def head_dim(self) -> int:
|
| 26 |
-
"""
|
| 27 |
return self.hidden_dim // self.num_heads
|
| 28 |
|
| 29 |
@property
|
| 30 |
def num_kv_groups(self) -> int:
|
| 31 |
-
"""
|
| 32 |
return self.num_heads // self.num_kv_heads
|
| 33 |
|
| 34 |
@classmethod
|
| 35 |
def debug_10m(cls) -> "ModelConfig":
|
| 36 |
-
"""~10M
|
| 37 |
return cls(
|
| 38 |
hidden_dim=256, num_layers=6, num_heads=8,
|
| 39 |
num_kv_heads=4, intermediate_dim=704, max_seq_len=512,
|
|
@@ -41,7 +41,7 @@ class ModelConfig:
|
|
| 41 |
|
| 42 |
@classmethod
|
| 43 |
def small_100m(cls) -> "ModelConfig":
|
| 44 |
-
"""~100M
|
| 45 |
return cls(
|
| 46 |
hidden_dim=768, num_layers=12, num_heads=12,
|
| 47 |
num_kv_heads=4, intermediate_dim=2048, max_seq_len=1024,
|
|
@@ -49,5 +49,5 @@ class ModelConfig:
|
|
| 49 |
|
| 50 |
@classmethod
|
| 51 |
def base_1b(cls) -> "ModelConfig":
|
| 52 |
-
"""~1.1B
|
| 53 |
-
return cls() #
|
|
|
|
| 3 |
|
| 4 |
@dataclass
|
| 5 |
class ModelConfig:
|
| 6 |
+
"""Manages model hyperparameters as a single dataclass.
|
| 7 |
|
| 8 |
+
Scale-specific presets:
|
| 9 |
+
- debug: ~10M (for pipeline validation)
|
| 10 |
+
- small: ~100M (for intermediate validation)
|
| 11 |
+
- base: ~1.1B (final target)
|
| 12 |
"""
|
| 13 |
vocab_size: int = 32_000
|
| 14 |
+
hidden_dim: int = 2048 # d_model: base dimension of the model
|
| 15 |
+
num_layers: int = 22 # number of Transformer blocks
|
| 16 |
+
num_heads: int = 16 # number of Query heads
|
| 17 |
+
num_kv_heads: int = 4 # number of Key/Value heads (GQA)
|
| 18 |
+
intermediate_dim: int = 5632 # FFN intermediate dimension (≈ 2.75 × hidden_dim)
|
| 19 |
+
max_seq_len: int = 2048 # maximum sequence length
|
| 20 |
+
dropout: float = 0.0 # typically 0 during pretraining
|
| 21 |
+
rope_theta: float = 10000.0 # RoPE frequency base
|
| 22 |
norm_eps: float = 1e-6 # RMSNorm epsilon
|
| 23 |
|
| 24 |
@property
|
| 25 |
def head_dim(self) -> int:
|
| 26 |
+
"""Dimension of each attention head."""
|
| 27 |
return self.hidden_dim // self.num_heads
|
| 28 |
|
| 29 |
@property
|
| 30 |
def num_kv_groups(self) -> int:
|
| 31 |
+
"""Number of Q heads per KV head in GQA."""
|
| 32 |
return self.num_heads // self.num_kv_heads
|
| 33 |
|
| 34 |
@classmethod
|
| 35 |
def debug_10m(cls) -> "ModelConfig":
|
| 36 |
+
"""~10M parameters - for fast debugging."""
|
| 37 |
return cls(
|
| 38 |
hidden_dim=256, num_layers=6, num_heads=8,
|
| 39 |
num_kv_heads=4, intermediate_dim=704, max_seq_len=512,
|
|
|
|
| 41 |
|
| 42 |
@classmethod
|
| 43 |
def small_100m(cls) -> "ModelConfig":
|
| 44 |
+
"""~100M parameters - for intermediate validation."""
|
| 45 |
return cls(
|
| 46 |
hidden_dim=768, num_layers=12, num_heads=12,
|
| 47 |
num_kv_heads=4, intermediate_dim=2048, max_seq_len=1024,
|
|
|
|
| 49 |
|
| 50 |
@classmethod
|
| 51 |
def base_1b(cls) -> "ModelConfig":
|
| 52 |
+
"""~1.1B parameters - final training target."""
|
| 53 |
+
return cls() # defaults are the 1B configuration
|
llm_lab/config/train_config.py
CHANGED
|
@@ -6,97 +6,97 @@ import torch
|
|
| 6 |
|
| 7 |
@dataclass
|
| 8 |
class TrainConfig:
|
| 9 |
-
"""
|
| 10 |
|
| 11 |
-
Colab Pro+ (A100 40GB)
|
| 12 |
-
|
| 13 |
"""
|
| 14 |
|
| 15 |
-
# ──
|
| 16 |
learning_rate: float = 3e-4
|
| 17 |
-
"""Peak LR.
|
| 18 |
-
GPT-3
|
| 19 |
125M → 6e-4, 350M → 3e-4, 1.3B → 2e-4
|
| 20 |
-
|
| 21 |
|
| 22 |
min_learning_rate: float = 3e-5
|
| 23 |
-
"""
|
| 24 |
-
|
| 25 |
|
| 26 |
weight_decay: float = 0.1
|
| 27 |
-
"""
|
| 28 |
-
|
| 29 |
|
| 30 |
beta1: float = 0.9
|
| 31 |
beta2: float = 0.95
|
| 32 |
-
"""Adam
|
| 33 |
-
|
| 34 |
|
| 35 |
adam_eps: float = 1e-8
|
| 36 |
grad_clip: float = 1.0
|
| 37 |
-
"""Gradient Clipping:
|
| 38 |
-
|
| 39 |
|
| 40 |
-
# ──
|
| 41 |
warmup_steps: int = 2000
|
| 42 |
-
"""Warmup:
|
| 43 |
-
|
| 44 |
-
-
|
| 45 |
-
-
|
| 46 |
-
- 2000
|
| 47 |
|
| 48 |
total_steps: int = 20_000
|
| 49 |
-
"""
|
| 50 |
-
10B tokens / (128 batch × 2048 seq_len) ≈ 38,000
|
| 51 |
-
|
| 52 |
|
| 53 |
-
# ──
|
| 54 |
micro_batch_size: int = 4
|
| 55 |
-
"""
|
| 56 |
-
|
| 57 |
|
| 58 |
gradient_accumulation_steps: int = 32
|
| 59 |
-
"""
|
| 60 |
-
|
| 61 |
-
-
|
| 62 |
-
- LLM
|
| 63 |
-
-
|
| 64 |
|
| 65 |
# ── Mixed Precision ──
|
| 66 |
dtype: str = "bfloat16"
|
| 67 |
-
"""bfloat16:
|
| 68 |
-
exponent
|
| 69 |
-
|
| 70 |
|
| 71 |
-
# ──
|
| 72 |
checkpoint_dir: str = "/content/drive/MyDrive/llm-1b-lab/checkpoints"
|
| 73 |
-
"""Google Drive
|
| 74 |
|
| 75 |
checkpoint_interval: int = 500
|
| 76 |
-
"""
|
| 77 |
-
|
| 78 |
-
|
| 79 |
|
| 80 |
max_checkpoints: int = 3
|
| 81 |
-
"""
|
| 82 |
-
|
| 83 |
|
| 84 |
-
# ──
|
| 85 |
log_interval: int = 10
|
| 86 |
-
"""
|
| 87 |
|
| 88 |
eval_interval: int = 500
|
| 89 |
-
"""
|
| 90 |
|
| 91 |
eval_steps: int = 20
|
| 92 |
-
"""
|
| 93 |
|
| 94 |
# ── wandb ──
|
| 95 |
wandb_project: str = "llm-1b-lab"
|
| 96 |
wandb_run_name: Optional[str] = None
|
| 97 |
use_wandb: bool = True
|
| 98 |
|
| 99 |
-
# ──
|
| 100 |
seed: int = 42
|
| 101 |
|
| 102 |
@property
|
|
@@ -105,8 +105,8 @@ class TrainConfig:
|
|
| 105 |
|
| 106 |
@property
|
| 107 |
def tokens_per_step(self) -> int:
|
| 108 |
-
"""
|
| 109 |
-
# max_seq_len
|
| 110 |
return self.effective_batch_size * 2048
|
| 111 |
|
| 112 |
@property
|
|
|
|
| 6 |
|
| 7 |
@dataclass
|
| 8 |
class TrainConfig:
|
| 9 |
+
"""Training hyperparameters and infrastructure configuration.
|
| 10 |
|
| 11 |
+
Default values optimized for Colab Pro+ (A100 40GB).
|
| 12 |
+
Each value includes an explanation of why it was chosen.
|
| 13 |
"""
|
| 14 |
|
| 15 |
+
# ── Optimization ──
|
| 16 |
learning_rate: float = 3e-4
|
| 17 |
+
"""Peak LR. 3e-4 is the standard for 1B-scale models.
|
| 18 |
+
The GPT-3 paper reports optimal LRs by model size:
|
| 19 |
125M → 6e-4, 350M → 3e-4, 1.3B → 2e-4
|
| 20 |
+
Our model (1.1B) starts at 3e-4; lower to 2e-4 if unstable."""
|
| 21 |
|
| 22 |
min_learning_rate: float = 3e-5
|
| 23 |
+
"""Minimum point of cosine decay. Typically 10% of peak.
|
| 24 |
+
Too low causes stagnation in later training; too high causes unstable convergence."""
|
| 25 |
|
| 26 |
weight_decay: float = 0.1
|
| 27 |
+
"""L2 regularization for AdamW. 0.1 is the LLM standard.
|
| 28 |
+
Not applied to embeddings and biases (by convention)."""
|
| 29 |
|
| 30 |
beta1: float = 0.9
|
| 31 |
beta2: float = 0.95
|
| 32 |
+
"""Adam momentum coefficients. β2=0.95 is more stable than β2=0.999 for LLM training.
|
| 33 |
+
With large batches and long training, a β2 that is too large slows adaptation."""
|
| 34 |
|
| 35 |
adam_eps: float = 1e-8
|
| 36 |
grad_clip: float = 1.0
|
| 37 |
+
"""Gradient Clipping: rescales gradients when their norm exceeds 1.0.
|
| 38 |
+
Prevents gradient spikes that occur during early training or with noisy data."""
|
| 39 |
|
| 40 |
+
# ── Scheduling ──
|
| 41 |
warmup_steps: int = 2000
|
| 42 |
+
"""Warmup: linearly increases LR from 0 to peak over the first 2000 steps.
|
| 43 |
+
Why is this necessary?
|
| 44 |
+
- Initial weights are random → large LR causes unstable updates
|
| 45 |
+
- Starting with a small LR lets the model find its direction before full training
|
| 46 |
+
- 2000 is roughly ~10% of total training steps (empirical rule)."""
|
| 47 |
|
| 48 |
total_steps: int = 20_000
|
| 49 |
+
"""Total number of training steps.
|
| 50 |
+
10B tokens / (128 batch × 2048 seq_len) ≈ 38,000, but
|
| 51 |
+
~20,000 effective steps when accounting for gradient accumulation."""
|
| 52 |
|
| 53 |
+
# ── Batch ──
|
| 54 |
micro_batch_size: int = 4
|
| 55 |
+
"""Batch size loaded onto the GPU at once.
|
| 56 |
+
4 is a safe upper bound for a 1B model in bf16 on an A100 40GB."""
|
| 57 |
|
| 58 |
gradient_accumulation_steps: int = 32
|
| 59 |
+
"""Number of gradient accumulation steps. Effective batch = 4 × 32 = 128.
|
| 60 |
+
Why is a large batch beneficial?
|
| 61 |
+
- More stable gradient estimates (reduced noise)
|
| 62 |
+
- LLM training typically uses an effective batch of 128–512
|
| 63 |
+
- When memory is limited, increase this and reduce micro_batch."""
|
| 64 |
|
| 65 |
# ── Mixed Precision ──
|
| 66 |
dtype: str = "bfloat16"
|
| 67 |
+
"""bfloat16: supported on A100, numerically more stable than fp16.
|
| 68 |
+
Uses the same number of exponent bits as fp32 → lower risk of overflow/underflow.
|
| 69 |
+
Change to 'float16' when falling back to T4/V100."""
|
| 70 |
|
| 71 |
+
# ── Checkpointing ──
|
| 72 |
checkpoint_dir: str = "/content/drive/MyDrive/llm-1b-lab/checkpoints"
|
| 73 |
+
"""Google Drive path. Preserved even when the Colab session expires."""
|
| 74 |
|
| 75 |
checkpoint_interval: int = 500
|
| 76 |
+
"""Save a checkpoint every 500 steps.
|
| 77 |
+
Roughly every ~30 minutes on an A100. Too frequent causes I/O overhead;
|
| 78 |
+
too infrequent risks large losses when the session expires."""
|
| 79 |
|
| 80 |
max_checkpoints: int = 3
|
| 81 |
+
"""Number of rolling checkpoints to retain; oldest are deleted first.
|
| 82 |
+
One checkpoint ≈ 8–10 GB → 3 checkpoints ≈ ~30 GB."""
|
| 83 |
|
| 84 |
+
# ── Logging ──
|
| 85 |
log_interval: int = 10
|
| 86 |
+
"""Log to console and wandb every 10 steps."""
|
| 87 |
|
| 88 |
eval_interval: int = 500
|
| 89 |
+
"""Measure validation loss every 500 steps."""
|
| 90 |
|
| 91 |
eval_steps: int = 20
|
| 92 |
+
"""Number of batches to use during validation. 20 × 4 × 2048 ≈ 160K tokens."""
|
| 93 |
|
| 94 |
# ── wandb ──
|
| 95 |
wandb_project: str = "llm-1b-lab"
|
| 96 |
wandb_run_name: Optional[str] = None
|
| 97 |
use_wandb: bool = True
|
| 98 |
|
| 99 |
+
# ── Reproducibility ──
|
| 100 |
seed: int = 42
|
| 101 |
|
| 102 |
@property
|
|
|
|
| 105 |
|
| 106 |
@property
|
| 107 |
def tokens_per_step(self) -> int:
|
| 108 |
+
"""Number of tokens processed per optimizer step."""
|
| 109 |
+
# max_seq_len is injected externally (see ModelConfig)
|
| 110 |
return self.effective_batch_size * 2048
|
| 111 |
|
| 112 |
@property
|
llm_lab/data/__init__.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
"""
|
| 2 |
from .tokenizer import Tokenizer
|
| 3 |
from .dataset import PackedStreamingDataset, ValidationDataset
|
| 4 |
from .pipeline import create_train_dataloader, train_tokenizer_from_dataset, setup_data_pipeline
|
|
|
|
| 1 |
+
"""Data pipeline module — tokenizer, streaming, and sequence packing."""
|
| 2 |
from .tokenizer import Tokenizer
|
| 3 |
from .dataset import PackedStreamingDataset, ValidationDataset
|
| 4 |
from .pipeline import create_train_dataloader, train_tokenizer_from_dataset, setup_data_pipeline
|
llm_lab/data/dataset.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
"""
|
| 2 |
|
| 3 |
from typing import Iterator, List, Dict, Optional
|
| 4 |
|
|
@@ -10,24 +10,24 @@ from .tokenizer import Tokenizer
|
|
| 10 |
|
| 11 |
|
| 12 |
class PackedStreamingDataset(IterableDataset):
|
| 13 |
-
"""Streaming +
|
| 14 |
|
| 15 |
-
|
| 16 |
-
-
|
| 17 |
-
-
|
| 18 |
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
→ [
|
| 22 |
|
| 23 |
-
|
| 24 |
-
- FineWeb-Edu 10B
|
| 25 |
-
- Colab
|
| 26 |
-
- Streaming:
|
| 27 |
|
| 28 |
-
|
| 29 |
-
-
|
| 30 |
-
-
|
| 31 |
"""
|
| 32 |
|
| 33 |
def __init__(
|
|
@@ -45,15 +45,16 @@ class PackedStreamingDataset(IterableDataset):
|
|
| 45 |
self.max_seq_len = config.max_seq_len
|
| 46 |
|
| 47 |
def _load_dataset(self, num_shards: int = 1, shard_index: int = 0):
|
| 48 |
-
"""HuggingFace
|
| 49 |
|
| 50 |
Args:
|
| 51 |
-
num_shards:
|
| 52 |
-
shard_index:
|
| 53 |
|
| 54 |
-
|
| 55 |
-
num_shards=4
|
| 56 |
-
|
|
|
|
| 57 |
"""
|
| 58 |
from datasets import load_dataset
|
| 59 |
|
|
@@ -61,87 +62,87 @@ class PackedStreamingDataset(IterableDataset):
|
|
| 61 |
self.config.dataset_name,
|
| 62 |
name=self.config.dataset_subset,
|
| 63 |
split=self.config.dataset_split,
|
| 64 |
-
streaming=True, #
|
| 65 |
trust_remote_code=True,
|
| 66 |
)
|
| 67 |
|
| 68 |
-
#
|
| 69 |
-
#
|
| 70 |
if num_shards > 1:
|
| 71 |
ds = ds.shard(num_shards=num_shards, index=shard_index)
|
| 72 |
|
| 73 |
-
#
|
| 74 |
ds = ds.shuffle(seed=self.seed, buffer_size=10_000)
|
| 75 |
|
| 76 |
return ds
|
| 77 |
|
| 78 |
def _tokenize_and_pack(self, dataset) -> Iterator[Dict[str, torch.Tensor]]:
|
| 79 |
-
"""
|
| 80 |
|
| 81 |
Yields:
|
| 82 |
{"input_ids": (max_seq_len,), "targets": (max_seq_len,)}
|
| 83 |
|
| 84 |
-
targets = input_ids
|
| 85 |
input_ids: [A, B, C, D, E]
|
| 86 |
targets: [B, C, D, E, F]
|
| 87 |
-
→
|
| 88 |
"""
|
| 89 |
-
buffer: List[int] = [] #
|
| 90 |
|
| 91 |
for example in dataset:
|
| 92 |
text = example[self.config.text_column]
|
| 93 |
if not text or not text.strip():
|
| 94 |
continue
|
| 95 |
|
| 96 |
-
#
|
| 97 |
token_ids = self.tokenizer.encode(text, add_special_tokens=False)
|
| 98 |
|
| 99 |
if not token_ids:
|
| 100 |
continue
|
| 101 |
|
| 102 |
-
# EOS
|
| 103 |
if self.config.use_eos_separator:
|
| 104 |
token_ids.append(self.tokenizer.eos_id)
|
| 105 |
|
| 106 |
-
#
|
| 107 |
buffer.extend(token_ids)
|
| 108 |
|
| 109 |
-
#
|
| 110 |
-
# +1
|
| 111 |
while len(buffer) >= self.max_seq_len + 1:
|
| 112 |
-
# max_seq_len + 1
|
| 113 |
chunk = buffer[: self.max_seq_len + 1]
|
| 114 |
buffer = buffer[self.max_seq_len + 1 :]
|
| 115 |
|
| 116 |
-
# input_ids:
|
| 117 |
input_ids = torch.tensor(chunk[:-1], dtype=torch.long)
|
| 118 |
-
# targets:
|
| 119 |
targets = torch.tensor(chunk[1:], dtype=torch.long)
|
| 120 |
|
| 121 |
yield {"input_ids": input_ids, "targets": targets}
|
| 122 |
|
| 123 |
def __iter__(self) -> Iterator[Dict[str, torch.Tensor]]:
|
| 124 |
-
"""
|
| 125 |
|
| 126 |
-
|
| 127 |
-
-
|
| 128 |
-
-
|
| 129 |
|
| 130 |
-
|
| 131 |
-
Worker 0:
|
| 132 |
-
Worker 1:
|
| 133 |
-
Worker 2:
|
| 134 |
-
Worker 3:
|
| 135 |
"""
|
| 136 |
worker_info = torch.utils.data.get_worker_info()
|
| 137 |
|
| 138 |
if worker_info is not None:
|
| 139 |
-
#
|
| 140 |
num_shards = worker_info.num_workers
|
| 141 |
shard_index = worker_info.id
|
| 142 |
worker_seed = self.seed + worker_info.id
|
| 143 |
else:
|
| 144 |
-
#
|
| 145 |
num_shards = 1
|
| 146 |
shard_index = 0
|
| 147 |
worker_seed = self.seed
|
|
@@ -153,10 +154,10 @@ class PackedStreamingDataset(IterableDataset):
|
|
| 153 |
|
| 154 |
|
| 155 |
class ValidationDataset:
|
| 156 |
-
"""
|
| 157 |
|
| 158 |
-
|
| 159 |
-
|
| 160 |
"""
|
| 161 |
|
| 162 |
def __init__(
|
|
@@ -174,10 +175,10 @@ class ValidationDataset:
|
|
| 174 |
self._prepare(seed)
|
| 175 |
|
| 176 |
def _prepare(self, seed: int):
|
| 177 |
-
"""
|
| 178 |
from datasets import load_dataset
|
| 179 |
|
| 180 |
-
print(f"[Validation] {self.num_samples}
|
| 181 |
|
| 182 |
ds = load_dataset(
|
| 183 |
self.config.dataset_name,
|
|
@@ -186,7 +187,7 @@ class ValidationDataset:
|
|
| 186 |
streaming=True,
|
| 187 |
trust_remote_code=True,
|
| 188 |
)
|
| 189 |
-
#
|
| 190 |
ds = ds.shuffle(seed=seed, buffer_size=5_000)
|
| 191 |
|
| 192 |
buffer: List[int] = []
|
|
@@ -217,10 +218,10 @@ class ValidationDataset:
|
|
| 217 |
})
|
| 218 |
count += 1
|
| 219 |
|
| 220 |
-
print(f"[Validation] {len(self.samples)}
|
| 221 |
|
| 222 |
def get_dataloader(self, batch_size: int) -> DataLoader:
|
| 223 |
-
"""
|
| 224 |
return DataLoader(
|
| 225 |
self.samples,
|
| 226 |
batch_size=batch_size,
|
|
@@ -231,10 +232,10 @@ class ValidationDataset:
|
|
| 231 |
|
| 232 |
|
| 233 |
def _collate_fn(batch: List[Dict[str, torch.Tensor]]) -> Dict[str, torch.Tensor]:
|
| 234 |
-
"""
|
| 235 |
|
| 236 |
-
|
| 237 |
-
|
| 238 |
"""
|
| 239 |
return {
|
| 240 |
"input_ids": torch.stack([s["input_ids"] for s in batch]),
|
|
|
|
| 1 |
+
"""Streaming dataset — sequence packing and validation dataset."""
|
| 2 |
|
| 3 |
from typing import Iterator, List, Dict, Optional
|
| 4 |
|
|
|
|
| 10 |
|
| 11 |
|
| 12 |
class PackedStreamingDataset(IterableDataset):
|
| 13 |
+
"""Streaming + sequence packing dataset.
|
| 14 |
|
| 15 |
+
Why sequence packing?
|
| 16 |
+
- Naive approach: truncate each document to max_seq_len with padding → wastes GPU
|
| 17 |
+
- Sequence packing: concatenate multiple documents to fill max_seq_len → 100% utilization
|
| 18 |
|
| 19 |
+
How it works:
|
| 20 |
+
Doc1 (300 tokens) + Doc2 (1500 tokens) + Doc3 (248 tokens) = 2048 tokens
|
| 21 |
+
→ [Doc1][EOS][Doc2][EOS][Doc3][EOS][... no padding, fits exactly]
|
| 22 |
|
| 23 |
+
Why streaming?
|
| 24 |
+
- FineWeb-Edu 10B samples: tens of GB even when compressed
|
| 25 |
+
- Full download not feasible on Colab disk limit (~200GB)
|
| 26 |
+
- Streaming: reads from the network only as much as needed
|
| 27 |
|
| 28 |
+
Notes for training:
|
| 29 |
+
- EOS token inserted at document boundaries so the model recognizes end-of-document
|
| 30 |
+
- EOS naturally serves as a boundary marker without cross-attention masking
|
| 31 |
"""
|
| 32 |
|
| 33 |
def __init__(
|
|
|
|
| 45 |
self.max_seq_len = config.max_seq_len
|
| 46 |
|
| 47 |
def _load_dataset(self, num_shards: int = 1, shard_index: int = 0):
|
| 48 |
+
"""Loads the HuggingFace dataset in streaming mode.
|
| 49 |
|
| 50 |
Args:
|
| 51 |
+
num_shards: Total number of shards (= DataLoader num_workers)
|
| 52 |
+
shard_index: The shard index this worker is responsible for (0 ~ num_shards-1)
|
| 53 |
|
| 54 |
+
Sharding principle:
|
| 55 |
+
With num_shards=4, the stream is split into 4 equal parts so each worker
|
| 56 |
+
processes a distinct 1/4. Shuffling is applied after sharding so there is
|
| 57 |
+
no document overlap between workers.
|
| 58 |
"""
|
| 59 |
from datasets import load_dataset
|
| 60 |
|
|
|
|
| 62 |
self.config.dataset_name,
|
| 63 |
name=self.config.dataset_subset,
|
| 64 |
split=self.config.dataset_split,
|
| 65 |
+
streaming=True, # Key: streaming mode
|
| 66 |
trust_remote_code=True,
|
| 67 |
)
|
| 68 |
|
| 69 |
+
# Full partitioning (sharding): worker i processes only 1/num_shards of the stream
|
| 70 |
+
# Must be applied before shuffling so each worker has a non-overlapping set of documents
|
| 71 |
if num_shards > 1:
|
| 72 |
ds = ds.shard(num_shards=num_shards, index=shard_index)
|
| 73 |
|
| 74 |
+
# Shuffle (approximate buffer-based shuffle in streaming mode)
|
| 75 |
ds = ds.shuffle(seed=self.seed, buffer_size=10_000)
|
| 76 |
|
| 77 |
return ds
|
| 78 |
|
| 79 |
def _tokenize_and_pack(self, dataset) -> Iterator[Dict[str, torch.Tensor]]:
|
| 80 |
+
"""Tokenizes documents and packs them into sequences.
|
| 81 |
|
| 82 |
Yields:
|
| 83 |
{"input_ids": (max_seq_len,), "targets": (max_seq_len,)}
|
| 84 |
|
| 85 |
+
targets = input_ids shifted by one position:
|
| 86 |
input_ids: [A, B, C, D, E]
|
| 87 |
targets: [B, C, D, E, F]
|
| 88 |
+
→ The model sees A and predicts B, sees B and predicts C, ...
|
| 89 |
"""
|
| 90 |
+
buffer: List[int] = [] # Token buffer
|
| 91 |
|
| 92 |
for example in dataset:
|
| 93 |
text = example[self.config.text_column]
|
| 94 |
if not text or not text.strip():
|
| 95 |
continue
|
| 96 |
|
| 97 |
+
# Tokenize (without special tokens)
|
| 98 |
token_ids = self.tokenizer.encode(text, add_special_tokens=False)
|
| 99 |
|
| 100 |
if not token_ids:
|
| 101 |
continue
|
| 102 |
|
| 103 |
+
# Append EOS token (marks document boundary)
|
| 104 |
if self.config.use_eos_separator:
|
| 105 |
token_ids.append(self.tokenizer.eos_id)
|
| 106 |
|
| 107 |
+
# Add to buffer
|
| 108 |
buffer.extend(token_ids)
|
| 109 |
|
| 110 |
+
# Generate sequences once the buffer is full enough
|
| 111 |
+
# +1 is needed to generate targets (input + next token)
|
| 112 |
while len(buffer) >= self.max_seq_len + 1:
|
| 113 |
+
# Extract max_seq_len + 1 tokens
|
| 114 |
chunk = buffer[: self.max_seq_len + 1]
|
| 115 |
buffer = buffer[self.max_seq_len + 1 :]
|
| 116 |
|
| 117 |
+
# input_ids: from the first to the second-to-last token
|
| 118 |
input_ids = torch.tensor(chunk[:-1], dtype=torch.long)
|
| 119 |
+
# targets: from the second to the last token (shifted by one)
|
| 120 |
targets = torch.tensor(chunk[1:], dtype=torch.long)
|
| 121 |
|
| 122 |
yield {"input_ids": input_ids, "targets": targets}
|
| 123 |
|
| 124 |
def __iter__(self) -> Iterator[Dict[str, torch.Tensor]]:
|
| 125 |
+
"""Iterator called by DataLoader.
|
| 126 |
|
| 127 |
+
Multi-worker support (full partitioning approach):
|
| 128 |
+
- Previous: all workers read the same stream with different seeds → possible document duplication
|
| 129 |
+
- Improved: ds.shard() splits the stream into num_workers parts → no document overlap between workers
|
| 130 |
|
| 131 |
+
Example (num_workers=4, total N documents):
|
| 132 |
+
Worker 0: docs 0, 4, 8, 12, ... (N/4 docs)
|
| 133 |
+
Worker 1: docs 1, 5, 9, 13, ... (N/4 docs)
|
| 134 |
+
Worker 2: docs 2, 6, 10, 14, ... (N/4 docs)
|
| 135 |
+
Worker 3: docs 3, 7, 11, 15, ... (N/4 docs)
|
| 136 |
"""
|
| 137 |
worker_info = torch.utils.data.get_worker_info()
|
| 138 |
|
| 139 |
if worker_info is not None:
|
| 140 |
+
# Full partitioning: assign a shard per worker + independent shuffle seed
|
| 141 |
num_shards = worker_info.num_workers
|
| 142 |
shard_index = worker_info.id
|
| 143 |
worker_seed = self.seed + worker_info.id
|
| 144 |
else:
|
| 145 |
+
# Single process: process the full stream without sharding
|
| 146 |
num_shards = 1
|
| 147 |
shard_index = 0
|
| 148 |
worker_seed = self.seed
|
|
|
|
| 154 |
|
| 155 |
|
| 156 |
class ValidationDataset:
|
| 157 |
+
"""Validation dataset.
|
| 158 |
|
| 159 |
+
Pre-fetches a fixed amount of data from the streaming dataset and stores it in memory.
|
| 160 |
+
Consistent data across evaluations is necessary for meaningful comparisons between epochs.
|
| 161 |
"""
|
| 162 |
|
| 163 |
def __init__(
|
|
|
|
| 175 |
self._prepare(seed)
|
| 176 |
|
| 177 |
def _prepare(self, seed: int):
|
| 178 |
+
"""Pre-extracts validation samples from the dataset."""
|
| 179 |
from datasets import load_dataset
|
| 180 |
|
| 181 |
+
print(f"[Validation] Preparing {self.num_samples} validation samples...")
|
| 182 |
|
| 183 |
ds = load_dataset(
|
| 184 |
self.config.dataset_name,
|
|
|
|
| 187 |
streaming=True,
|
| 188 |
trust_remote_code=True,
|
| 189 |
)
|
| 190 |
+
# Use a different seed and skip the beginning to avoid overlap with training data
|
| 191 |
ds = ds.shuffle(seed=seed, buffer_size=5_000)
|
| 192 |
|
| 193 |
buffer: List[int] = []
|
|
|
|
| 218 |
})
|
| 219 |
count += 1
|
| 220 |
|
| 221 |
+
print(f"[Validation] {len(self.samples)} samples ready")
|
| 222 |
|
| 223 |
def get_dataloader(self, batch_size: int) -> DataLoader:
|
| 224 |
+
"""Returns a validation DataLoader."""
|
| 225 |
return DataLoader(
|
| 226 |
self.samples,
|
| 227 |
batch_size=batch_size,
|
|
|
|
| 232 |
|
| 233 |
|
| 234 |
def _collate_fn(batch: List[Dict[str, torch.Tensor]]) -> Dict[str, torch.Tensor]:
|
| 235 |
+
"""Combines samples in a batch into a single tensor.
|
| 236 |
|
| 237 |
+
Because of sequence packing, all samples have the same length (max_seq_len),
|
| 238 |
+
so no additional padding is needed.
|
| 239 |
"""
|
| 240 |
return {
|
| 241 |
"input_ids": torch.stack([s["input_ids"] for s in batch]),
|
llm_lab/data/diagnostics.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
"""
|
| 2 |
|
| 3 |
import time
|
| 4 |
from typing import Dict
|
|
@@ -11,13 +11,13 @@ from .tokenizer import Tokenizer
|
|
| 11 |
|
| 12 |
|
| 13 |
class DataPipelineDiagnostics:
|
| 14 |
-
"""
|
| 15 |
|
| 16 |
-
|
| 17 |
-
1)
|
| 18 |
-
2)
|
| 19 |
-
3)
|
| 20 |
-
4)
|
| 21 |
"""
|
| 22 |
|
| 23 |
@staticmethod
|
|
@@ -26,11 +26,11 @@ class DataPipelineDiagnostics:
|
|
| 26 |
config: DataConfig,
|
| 27 |
num_samples: int = 1000,
|
| 28 |
):
|
| 29 |
-
"""
|
| 30 |
from datasets import load_dataset
|
| 31 |
|
| 32 |
print("\n" + "=" * 60)
|
| 33 |
-
print("
|
| 34 |
print("=" * 60)
|
| 35 |
|
| 36 |
ds = load_dataset(
|
|
@@ -59,24 +59,24 @@ class DataPipelineDiagnostics:
|
|
| 59 |
|
| 60 |
avg_tokens = sum(token_counts) / len(token_counts)
|
| 61 |
avg_chars = sum(char_counts) / len(char_counts)
|
| 62 |
-
compression_ratio = avg_chars / avg_tokens #
|
| 63 |
|
| 64 |
-
print(f"
|
| 65 |
-
print(f"
|
| 66 |
-
print(f"
|
| 67 |
-
print(f"
|
| 68 |
-
print(f"
|
| 69 |
-
print(f"
|
| 70 |
|
| 71 |
-
#
|
| 72 |
test_text = "The quick brown fox jumps over the lazy dog."
|
| 73 |
encoded = tokenizer.encode(test_text)
|
| 74 |
decoded = tokenizer.decode(encoded)
|
| 75 |
roundtrip_ok = test_text.strip() in decoded.strip()
|
| 76 |
-
print(f"\n
|
| 77 |
-
print(f"
|
| 78 |
-
print(f"
|
| 79 |
-
print(f"
|
| 80 |
|
| 81 |
@staticmethod
|
| 82 |
def benchmark_throughput(
|
|
@@ -84,13 +84,13 @@ class DataPipelineDiagnostics:
|
|
| 84 |
num_batches: int = 50,
|
| 85 |
seq_len: int = 2048,
|
| 86 |
):
|
| 87 |
-
"""
|
| 88 |
|
| 89 |
-
|
| 90 |
-
|
| 91 |
"""
|
| 92 |
print("\n" + "=" * 60)
|
| 93 |
-
print("
|
| 94 |
print("=" * 60)
|
| 95 |
|
| 96 |
total_tokens = 0
|
|
@@ -110,23 +110,23 @@ class DataPipelineDiagnostics:
|
|
| 110 |
elapsed = time.time() - start_time
|
| 111 |
tps = total_tokens / elapsed
|
| 112 |
|
| 113 |
-
print(f"\n
|
| 114 |
-
print(f"
|
| 115 |
-
print(f"
|
| 116 |
-
print(f"
|
| 117 |
-
print(f"\n
|
| 118 |
if tps > 80_000:
|
| 119 |
-
print(f"
|
| 120 |
elif tps > 30_000:
|
| 121 |
-
print(f"
|
| 122 |
else:
|
| 123 |
-
print(f"
|
| 124 |
|
| 125 |
@staticmethod
|
| 126 |
def inspect_batch(batch: Dict[str, torch.Tensor], tokenizer: Tokenizer):
|
| 127 |
-
"""
|
| 128 |
print("\n" + "=" * 60)
|
| 129 |
-
print("
|
| 130 |
print("=" * 60)
|
| 131 |
|
| 132 |
input_ids = batch["input_ids"]
|
|
@@ -135,19 +135,19 @@ class DataPipelineDiagnostics:
|
|
| 135 |
print(f" input_ids shape: {input_ids.shape}")
|
| 136 |
print(f" targets shape: {targets.shape}")
|
| 137 |
print(f" dtype: {input_ids.dtype}")
|
| 138 |
-
print(f"
|
| 139 |
|
| 140 |
-
#
|
| 141 |
shift_correct = (input_ids[:, 1:] == targets[:, :-1]).float().mean().item()
|
| 142 |
-
print(f" Shift
|
| 143 |
|
| 144 |
-
# EOS
|
| 145 |
eos_count = (input_ids == tokenizer.eos_id).sum().item()
|
| 146 |
total_tokens = input_ids.numel()
|
| 147 |
-
print(f" EOS
|
| 148 |
|
| 149 |
-
#
|
| 150 |
first_sample = input_ids[0][:100].tolist()
|
| 151 |
decoded_preview = tokenizer.decode(first_sample)
|
| 152 |
-
print(f"\n
|
| 153 |
print(f" {decoded_preview[:300]}...")
|
|
|
|
| 1 |
+
"""Data pipeline diagnostic tools."""
|
| 2 |
|
| 3 |
import time
|
| 4 |
from typing import Dict
|
|
|
|
| 11 |
|
| 12 |
|
| 13 |
class DataPipelineDiagnostics:
|
| 14 |
+
"""Diagnoses the performance and quality of the data pipeline.
|
| 15 |
|
| 16 |
+
Items to verify before training:
|
| 17 |
+
1) Tokenizer quality: average tokens/document, unknown token ratio
|
| 18 |
+
2) Packing efficiency: actual token ratio vs. padding ratio
|
| 19 |
+
3) Throughput: tokens/sec (check for data loading bottlenecks)
|
| 20 |
+
4) Batch shape: correctness of shape and dtype
|
| 21 |
"""
|
| 22 |
|
| 23 |
@staticmethod
|
|
|
|
| 26 |
config: DataConfig,
|
| 27 |
num_samples: int = 1000,
|
| 28 |
):
|
| 29 |
+
"""Diagnoses tokenizer quality."""
|
| 30 |
from datasets import load_dataset
|
| 31 |
|
| 32 |
print("\n" + "=" * 60)
|
| 33 |
+
print("Tokenizer Quality Diagnostics")
|
| 34 |
print("=" * 60)
|
| 35 |
|
| 36 |
ds = load_dataset(
|
|
|
|
| 59 |
|
| 60 |
avg_tokens = sum(token_counts) / len(token_counts)
|
| 61 |
avg_chars = sum(char_counts) / len(char_counts)
|
| 62 |
+
compression_ratio = avg_chars / avg_tokens # Characters per token ratio
|
| 63 |
|
| 64 |
+
print(f" Documents analyzed: {len(token_counts):,}")
|
| 65 |
+
print(f" Average tokens/document: {avg_tokens:.1f}")
|
| 66 |
+
print(f" Average chars/document: {avg_chars:.1f}")
|
| 67 |
+
print(f" Compression ratio (chars/token): {compression_ratio:.2f}")
|
| 68 |
+
print(f" -> 3.5~4.5 is normal for English")
|
| 69 |
+
print(f" Min tokens: {min(token_counts)}, Max: {max(token_counts)}")
|
| 70 |
|
| 71 |
+
# Round-trip decode test
|
| 72 |
test_text = "The quick brown fox jumps over the lazy dog."
|
| 73 |
encoded = tokenizer.encode(test_text)
|
| 74 |
decoded = tokenizer.decode(encoded)
|
| 75 |
roundtrip_ok = test_text.strip() in decoded.strip()
|
| 76 |
+
print(f"\n Round-trip test: {'PASSED' if roundtrip_ok else 'FAILED'}")
|
| 77 |
+
print(f" Original: {test_text}")
|
| 78 |
+
print(f" Encoded: {encoded[:20]}{'...' if len(encoded) > 20 else ''}")
|
| 79 |
+
print(f" Decoded: {decoded}")
|
| 80 |
|
| 81 |
@staticmethod
|
| 82 |
def benchmark_throughput(
|
|
|
|
| 84 |
num_batches: int = 50,
|
| 85 |
seq_len: int = 2048,
|
| 86 |
):
|
| 87 |
+
"""Measures data loading throughput.
|
| 88 |
|
| 89 |
+
A key diagnostic to determine whether data loading is the bottleneck in GPU training.
|
| 90 |
+
Goal: data loading should be faster than GPU computation (data loading != bottleneck).
|
| 91 |
"""
|
| 92 |
print("\n" + "=" * 60)
|
| 93 |
+
print("Data Loading Throughput Benchmark")
|
| 94 |
print("=" * 60)
|
| 95 |
|
| 96 |
total_tokens = 0
|
|
|
|
| 110 |
elapsed = time.time() - start_time
|
| 111 |
tps = total_tokens / elapsed
|
| 112 |
|
| 113 |
+
print(f"\n Total batches: {num_batches}")
|
| 114 |
+
print(f" Total tokens: {total_tokens:,}")
|
| 115 |
+
print(f" Elapsed time: {elapsed:.2f}s")
|
| 116 |
+
print(f" Average throughput: {tps:,.0f} tokens/sec")
|
| 117 |
+
print(f"\n A100 training throughput reference ~50-80K tokens/sec:")
|
| 118 |
if tps > 80_000:
|
| 119 |
+
print(f" Data loading is not the bottleneck")
|
| 120 |
elif tps > 30_000:
|
| 121 |
+
print(f" Borderline - consider increasing num_workers")
|
| 122 |
else:
|
| 123 |
+
print(f" Data loading is the bottleneck! Adjust num_workers/prefetch")
|
| 124 |
|
| 125 |
@staticmethod
|
| 126 |
def inspect_batch(batch: Dict[str, torch.Tensor], tokenizer: Tokenizer):
|
| 127 |
+
"""Inspects a single batch in detail."""
|
| 128 |
print("\n" + "=" * 60)
|
| 129 |
+
print("Batch Detailed Inspection")
|
| 130 |
print("=" * 60)
|
| 131 |
|
| 132 |
input_ids = batch["input_ids"]
|
|
|
|
| 135 |
print(f" input_ids shape: {input_ids.shape}")
|
| 136 |
print(f" targets shape: {targets.shape}")
|
| 137 |
print(f" dtype: {input_ids.dtype}")
|
| 138 |
+
print(f" value range: [{input_ids.min().item()}, {input_ids.max().item()}]")
|
| 139 |
|
| 140 |
+
# Verify shift relationship: targets[i] == input_ids[i+1]
|
| 141 |
shift_correct = (input_ids[:, 1:] == targets[:, :-1]).float().mean().item()
|
| 142 |
+
print(f" Shift consistency: {shift_correct*100:.1f}% (should be 100%)")
|
| 143 |
|
| 144 |
+
# EOS token distribution (document boundaries)
|
| 145 |
eos_count = (input_ids == tokenizer.eos_id).sum().item()
|
| 146 |
total_tokens = input_ids.numel()
|
| 147 |
+
print(f" EOS token count: {eos_count} / {total_tokens} ({eos_count/total_tokens*100:.2f}%)")
|
| 148 |
|
| 149 |
+
# Decode preview of the first sample
|
| 150 |
first_sample = input_ids[0][:100].tolist()
|
| 151 |
decoded_preview = tokenizer.decode(first_sample)
|
| 152 |
+
print(f"\n First sample decoded (first 100 tokens):")
|
| 153 |
print(f" {decoded_preview[:300]}...")
|
llm_lab/data/pipeline.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
"""
|
| 2 |
|
| 3 |
from typing import Optional
|
| 4 |
|
|
@@ -15,12 +15,12 @@ def create_train_dataloader(
|
|
| 15 |
config: DataConfig,
|
| 16 |
seed: int = 42,
|
| 17 |
) -> DataLoader:
|
| 18 |
-
"""
|
| 19 |
|
| 20 |
Returns:
|
| 21 |
-
|
| 22 |
|
| 23 |
-
|
| 24 |
dataloader = create_train_dataloader(tokenizer, config)
|
| 25 |
for step, batch in enumerate(dataloader):
|
| 26 |
input_ids = batch["input_ids"].to(device) # (B, seq_len)
|
|
@@ -40,7 +40,7 @@ def create_train_dataloader(
|
|
| 40 |
batch_size=config.batch_size,
|
| 41 |
num_workers=config.num_workers,
|
| 42 |
prefetch_factor=config.prefetch_factor if config.num_workers > 0 else None,
|
| 43 |
-
pin_memory=True, # GPU
|
| 44 |
collate_fn=_collate_fn,
|
| 45 |
)
|
| 46 |
|
|
@@ -48,17 +48,17 @@ def create_train_dataloader(
|
|
| 48 |
|
| 49 |
|
| 50 |
def train_tokenizer_from_dataset(config: DataConfig) -> Tokenizer:
|
| 51 |
-
"""
|
| 52 |
|
| 53 |
-
|
| 54 |
-
|
| 55 |
"""
|
| 56 |
from datasets import load_dataset
|
| 57 |
|
| 58 |
-
print(f"[Train Tokenizer] {config.dataset_name}
|
| 59 |
-
print(f"[Train Tokenizer]
|
| 60 |
|
| 61 |
-
#
|
| 62 |
ds = load_dataset(
|
| 63 |
config.dataset_name,
|
| 64 |
name=config.dataset_subset,
|
|
@@ -77,9 +77,9 @@ def train_tokenizer_from_dataset(config: DataConfig) -> Tokenizer:
|
|
| 77 |
yield text
|
| 78 |
count += 1
|
| 79 |
if count % 10_000 == 0:
|
| 80 |
-
print(f" ... {count:,}
|
| 81 |
|
| 82 |
-
#
|
| 83 |
tokenizer = Tokenizer(config)
|
| 84 |
tokenizer.train_bpe(text_iterator(), save_dir=config.tokenizer_save_dir)
|
| 85 |
|
|
@@ -91,38 +91,38 @@ def setup_data_pipeline(
|
|
| 91 |
tokenizer_path: Optional[str] = None,
|
| 92 |
config: Optional[DataConfig] = None,
|
| 93 |
) -> tuple:
|
| 94 |
-
"""
|
| 95 |
|
| 96 |
Args:
|
| 97 |
tokenizer_mode:
|
| 98 |
-
"train_new" -
|
| 99 |
-
"load_trained" -
|
| 100 |
-
"pretrained" -
|
| 101 |
tokenizer_path:
|
| 102 |
-
"train_new"
|
| 103 |
-
"load_trained"
|
| 104 |
-
"pretrained"
|
| 105 |
|
| 106 |
Returns:
|
| 107 |
(tokenizer, train_dataloader, val_dataloader)
|
| 108 |
|
| 109 |
-
|
| 110 |
-
#
|
| 111 |
tok, train_dl, val_dl = setup_data_pipeline("train_new")
|
| 112 |
|
| 113 |
-
#
|
| 114 |
tok, train_dl, val_dl = setup_data_pipeline("load_trained", "./tokenizer")
|
| 115 |
|
| 116 |
-
#
|
| 117 |
tok, train_dl, val_dl = setup_data_pipeline("pretrained")
|
| 118 |
"""
|
| 119 |
config = config or DataConfig()
|
| 120 |
|
| 121 |
print("=" * 60)
|
| 122 |
-
print("
|
| 123 |
print("=" * 60)
|
| 124 |
|
| 125 |
-
# ── Step 1:
|
| 126 |
tokenizer = Tokenizer(config)
|
| 127 |
|
| 128 |
if tokenizer_mode == "train_new":
|
|
@@ -136,21 +136,21 @@ def setup_data_pipeline(
|
|
| 136 |
else:
|
| 137 |
raise ValueError(f"Unknown tokenizer_mode: {tokenizer_mode}")
|
| 138 |
|
| 139 |
-
# ── Step 2:
|
| 140 |
-
print("\n[DataLoader]
|
| 141 |
train_dataloader = create_train_dataloader(tokenizer, config)
|
| 142 |
|
| 143 |
-
# ── Step 3:
|
| 144 |
-
print("\n[DataLoader]
|
| 145 |
val_dataset = ValidationDataset(tokenizer, config, num_samples=100)
|
| 146 |
val_dataloader = val_dataset.get_dataloader(batch_size=config.batch_size)
|
| 147 |
|
| 148 |
print("\n" + "=" * 60)
|
| 149 |
-
print("
|
| 150 |
-
print(f"
|
| 151 |
-
print(f"
|
| 152 |
-
print(f"
|
| 153 |
-
print(f"
|
| 154 |
print("=" * 60)
|
| 155 |
|
| 156 |
return tokenizer, train_dataloader, val_dataloader
|
|
|
|
| 1 |
+
"""Data pipeline integration — DataLoader creation, tokenizer training, and Quick Start."""
|
| 2 |
|
| 3 |
from typing import Optional
|
| 4 |
|
|
|
|
| 15 |
config: DataConfig,
|
| 16 |
seed: int = 42,
|
| 17 |
) -> DataLoader:
|
| 18 |
+
"""Creates a training DataLoader.
|
| 19 |
|
| 20 |
Returns:
|
| 21 |
+
An infinitely repeating streaming DataLoader
|
| 22 |
|
| 23 |
+
Usage:
|
| 24 |
dataloader = create_train_dataloader(tokenizer, config)
|
| 25 |
for step, batch in enumerate(dataloader):
|
| 26 |
input_ids = batch["input_ids"].to(device) # (B, seq_len)
|
|
|
|
| 40 |
batch_size=config.batch_size,
|
| 41 |
num_workers=config.num_workers,
|
| 42 |
prefetch_factor=config.prefetch_factor if config.num_workers > 0 else None,
|
| 43 |
+
pin_memory=True, # Improves GPU transfer speed
|
| 44 |
collate_fn=_collate_fn,
|
| 45 |
)
|
| 46 |
|
|
|
|
| 48 |
|
| 49 |
|
| 50 |
def train_tokenizer_from_dataset(config: DataConfig) -> Tokenizer:
|
| 51 |
+
"""Trains a BPE tokenizer from the dataset.
|
| 52 |
|
| 53 |
+
There is no need to use the entire dataset; 50K documents is sufficient,
|
| 54 |
+
since the tokenizer vocab only needs to reflect the statistics of the full data.
|
| 55 |
"""
|
| 56 |
from datasets import load_dataset
|
| 57 |
|
| 58 |
+
print(f"[Train Tokenizer] Training tokenizer from {config.dataset_name}")
|
| 59 |
+
print(f"[Train Tokenizer] Number of training documents: {config.tokenizer_train_samples:,}")
|
| 60 |
|
| 61 |
+
# Create text iterator
|
| 62 |
ds = load_dataset(
|
| 63 |
config.dataset_name,
|
| 64 |
name=config.dataset_subset,
|
|
|
|
| 77 |
yield text
|
| 78 |
count += 1
|
| 79 |
if count % 10_000 == 0:
|
| 80 |
+
print(f" ... {count:,} documents processed")
|
| 81 |
|
| 82 |
+
# Train tokenizer
|
| 83 |
tokenizer = Tokenizer(config)
|
| 84 |
tokenizer.train_bpe(text_iterator(), save_dir=config.tokenizer_save_dir)
|
| 85 |
|
|
|
|
| 91 |
tokenizer_path: Optional[str] = None,
|
| 92 |
config: Optional[DataConfig] = None,
|
| 93 |
) -> tuple:
|
| 94 |
+
"""Sets up the data pipeline in one call.
|
| 95 |
|
| 96 |
Args:
|
| 97 |
tokenizer_mode:
|
| 98 |
+
"train_new" - Train a new BPE tokenizer
|
| 99 |
+
"load_trained" - Load a previously trained tokenizer
|
| 100 |
+
"pretrained" - Use a pretrained HuggingFace tokenizer
|
| 101 |
tokenizer_path:
|
| 102 |
+
"train_new" -> Save directory (default: ./tokenizer)
|
| 103 |
+
"load_trained" -> Path to the saved tokenizer
|
| 104 |
+
"pretrained" -> HF model name (default: mistralai/Mistral-7B-v0.1)
|
| 105 |
|
| 106 |
Returns:
|
| 107 |
(tokenizer, train_dataloader, val_dataloader)
|
| 108 |
|
| 109 |
+
Example usage (Colab):
|
| 110 |
+
# Method 1: Train a new tokenizer
|
| 111 |
tok, train_dl, val_dl = setup_data_pipeline("train_new")
|
| 112 |
|
| 113 |
+
# Method 2: Load an existing tokenizer
|
| 114 |
tok, train_dl, val_dl = setup_data_pipeline("load_trained", "./tokenizer")
|
| 115 |
|
| 116 |
+
# Method 3: Use a pretrained tokenizer (simplest)
|
| 117 |
tok, train_dl, val_dl = setup_data_pipeline("pretrained")
|
| 118 |
"""
|
| 119 |
config = config or DataConfig()
|
| 120 |
|
| 121 |
print("=" * 60)
|
| 122 |
+
print("Data Pipeline Setup")
|
| 123 |
print("=" * 60)
|
| 124 |
|
| 125 |
+
# ── Step 1: Tokenizer ──
|
| 126 |
tokenizer = Tokenizer(config)
|
| 127 |
|
| 128 |
if tokenizer_mode == "train_new":
|
|
|
|
| 136 |
else:
|
| 137 |
raise ValueError(f"Unknown tokenizer_mode: {tokenizer_mode}")
|
| 138 |
|
| 139 |
+
# ── Step 2: Training DataLoader ──
|
| 140 |
+
print("\n[DataLoader] Creating training DataLoader...")
|
| 141 |
train_dataloader = create_train_dataloader(tokenizer, config)
|
| 142 |
|
| 143 |
+
# ── Step 3: Validation DataLoader ──
|
| 144 |
+
print("\n[DataLoader] Creating validation DataLoader...")
|
| 145 |
val_dataset = ValidationDataset(tokenizer, config, num_samples=100)
|
| 146 |
val_dataloader = val_dataset.get_dataloader(batch_size=config.batch_size)
|
| 147 |
|
| 148 |
print("\n" + "=" * 60)
|
| 149 |
+
print("Data pipeline setup complete!")
|
| 150 |
+
print(f" Tokenizer vocab: {tokenizer.vocab_size:,}")
|
| 151 |
+
print(f" Sequence length: {config.max_seq_len}")
|
| 152 |
+
print(f" Batch size: {config.batch_size}")
|
| 153 |
+
print(f" Tokens/batch: {config.batch_size * config.max_seq_len:,}")
|
| 154 |
print("=" * 60)
|
| 155 |
|
| 156 |
return tokenizer, train_dataloader, val_dataloader
|
llm_lab/data/tokenizer.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
"""
|
| 2 |
|
| 3 |
import os
|
| 4 |
import json
|
|
@@ -8,23 +8,23 @@ from llm_lab.config import DataConfig
|
|
| 8 |
|
| 9 |
|
| 10 |
class Tokenizer:
|
| 11 |
-
"""
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
1)
|
| 15 |
-
2)
|
| 16 |
-
3)
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
- BPE
|
| 20 |
-
|
| 21 |
-
-
|
| 22 |
-
|
| 23 |
-
BPE(Byte Pair Encoding)
|
| 24 |
-
1)
|
| 25 |
-
2)
|
| 26 |
-
3)
|
| 27 |
-
→
|
| 28 |
"""
|
| 29 |
|
| 30 |
def __init__(self, config: DataConfig):
|
|
@@ -32,17 +32,17 @@ class Tokenizer:
|
|
| 32 |
self._tokenizer = None
|
| 33 |
self.vocab_size = config.vocab_size
|
| 34 |
|
| 35 |
-
#
|
| 36 |
self.bos_id: int = 1 # Beginning of Sequence
|
| 37 |
self.eos_id: int = 2 # End of Sequence
|
| 38 |
self.pad_id: int = 0 # Padding
|
| 39 |
|
| 40 |
# ────────────────────────────────────────────────
|
| 41 |
-
#
|
| 42 |
# ────────────────────────────────────────────────
|
| 43 |
|
| 44 |
def load_sentencepiece(self, model_path: str):
|
| 45 |
-
"""
|
| 46 |
import sentencepiece as spm
|
| 47 |
|
| 48 |
self._tokenizer = spm.SentencePieceProcessor()
|
|
@@ -55,23 +55,23 @@ class Tokenizer:
|
|
| 55 |
self._encode_fn = self._tokenizer.Encode
|
| 56 |
self._decode_fn = self._tokenizer.Decode
|
| 57 |
|
| 58 |
-
print(f"[Tokenizer] SentencePiece
|
| 59 |
|
| 60 |
# ────────────────────────────────────────────────
|
| 61 |
-
#
|
| 62 |
# ────────────────────────────────────────────────
|
| 63 |
|
| 64 |
def train_bpe(self, text_iterator: Iterator[str], save_dir: Optional[str] = None):
|
| 65 |
-
"""BPE
|
| 66 |
|
| 67 |
Args:
|
| 68 |
-
text_iterator:
|
| 69 |
-
save_dir:
|
| 70 |
|
| 71 |
-
|
| 72 |
-
-
|
| 73 |
-
-
|
| 74 |
-
- 32K
|
| 75 |
"""
|
| 76 |
from tokenizers import Tokenizer as HFTokenizer
|
| 77 |
from tokenizers.models import BPE
|
|
@@ -79,27 +79,27 @@ class Tokenizer:
|
|
| 79 |
from tokenizers.pre_tokenizers import ByteLevel
|
| 80 |
from tokenizers.processors import TemplateProcessing
|
| 81 |
|
| 82 |
-
print("[Tokenizer] BPE
|
| 83 |
|
| 84 |
-
# BPE
|
| 85 |
tokenizer = HFTokenizer(BPE(unk_token="<unk>"))
|
| 86 |
tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=False)
|
| 87 |
|
| 88 |
-
#
|
| 89 |
special_tokens = ["<pad>", "<s>", "</s>", "<unk>"]
|
| 90 |
|
| 91 |
-
#
|
| 92 |
trainer = BpeTrainer(
|
| 93 |
vocab_size=self.config.vocab_size,
|
| 94 |
special_tokens=special_tokens,
|
| 95 |
-
min_frequency=2, #
|
| 96 |
show_progress=True,
|
| 97 |
)
|
| 98 |
|
| 99 |
-
#
|
| 100 |
tokenizer.train_from_iterator(text_iterator, trainer=trainer)
|
| 101 |
|
| 102 |
-
#
|
| 103 |
tokenizer.post_processor = TemplateProcessing(
|
| 104 |
single="<s> $A </s>",
|
| 105 |
special_tokens=[("<s>", 1), ("</s>", 2)],
|
|
@@ -114,11 +114,11 @@ class Tokenizer:
|
|
| 114 |
self._encode_fn = lambda text: tokenizer.encode(text).ids
|
| 115 |
self._decode_fn = lambda ids: tokenizer.decode(ids)
|
| 116 |
|
| 117 |
-
#
|
| 118 |
save_dir = save_dir or self.config.tokenizer_save_dir
|
| 119 |
os.makedirs(save_dir, exist_ok=True)
|
| 120 |
tokenizer.save(os.path.join(save_dir, "tokenizer.json"))
|
| 121 |
-
#
|
| 122 |
meta = {
|
| 123 |
"vocab_size": self.vocab_size,
|
| 124 |
"bos_id": self.bos_id,
|
|
@@ -128,23 +128,23 @@ class Tokenizer:
|
|
| 128 |
with open(os.path.join(save_dir, "tokenizer_meta.json"), "w") as f:
|
| 129 |
json.dump(meta, f, indent=2)
|
| 130 |
|
| 131 |
-
print(f"[Tokenizer]
|
| 132 |
-
print(f"[Tokenizer]
|
| 133 |
|
| 134 |
# ────────────────────────────────────────────────
|
| 135 |
-
#
|
| 136 |
# ────────────────────────────────────────────────
|
| 137 |
|
| 138 |
def load_pretrained_hf(self, name_or_path: str = "meta-llama/Llama-2-7b-hf"):
|
| 139 |
-
"""
|
| 140 |
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
"""
|
| 145 |
from transformers import AutoTokenizer
|
| 146 |
|
| 147 |
-
print(f"[Tokenizer] HF
|
| 148 |
tokenizer = AutoTokenizer.from_pretrained(name_or_path)
|
| 149 |
|
| 150 |
self._tokenizer = tokenizer
|
|
@@ -156,10 +156,10 @@ class Tokenizer:
|
|
| 156 |
self._encode_fn = lambda text: tokenizer.encode(text, add_special_tokens=False)
|
| 157 |
self._decode_fn = lambda ids: tokenizer.decode(ids)
|
| 158 |
|
| 159 |
-
print(f"[Tokenizer]
|
| 160 |
|
| 161 |
def load_trained_hf(self, path: str):
|
| 162 |
-
"""
|
| 163 |
from tokenizers import Tokenizer as HFTokenizer
|
| 164 |
|
| 165 |
tokenizer = HFTokenizer.from_file(os.path.join(path, "tokenizer.json"))
|
|
@@ -175,21 +175,21 @@ class Tokenizer:
|
|
| 175 |
self._encode_fn = lambda text: tokenizer.encode(text).ids
|
| 176 |
self._decode_fn = lambda ids: tokenizer.decode(ids)
|
| 177 |
|
| 178 |
-
print(f"[Tokenizer]
|
| 179 |
|
| 180 |
# ────────────────────────────────────────────────
|
| 181 |
-
#
|
| 182 |
# ────────────────────────────────────────────────
|
| 183 |
|
| 184 |
def encode(self, text: str, add_special_tokens: bool = False) -> List[int]:
|
| 185 |
-
"""
|
| 186 |
ids = self._encode_fn(text)
|
| 187 |
if add_special_tokens:
|
| 188 |
ids = [self.bos_id] + ids + [self.eos_id]
|
| 189 |
return ids
|
| 190 |
|
| 191 |
def decode(self, ids: List[int]) -> str:
|
| 192 |
-
"""
|
| 193 |
return self._decode_fn(ids)
|
| 194 |
|
| 195 |
def __len__(self) -> int:
|
|
|
|
| 1 |
+
"""Tokenizer wrapper — SentencePiece / HuggingFace BPE integration."""
|
| 2 |
|
| 3 |
import os
|
| 4 |
import json
|
|
|
|
| 8 |
|
| 9 |
|
| 10 |
class Tokenizer:
|
| 11 |
+
"""Unified tokenizer wrapper.
|
| 12 |
+
|
| 13 |
+
Supports three methods:
|
| 14 |
+
1) Load an existing SentencePiece model
|
| 15 |
+
2) Train a new tokenizer using the HuggingFace tokenizers library
|
| 16 |
+
3) Load a pretrained HF tokenizer (e.g., LLaMA tokenizer)
|
| 17 |
+
|
| 18 |
+
Why not implement from scratch?
|
| 19 |
+
- Training a BPE tokenizer involves large-scale text statistics processing,
|
| 20 |
+
which has little direct relevance to understanding model architecture.
|
| 21 |
+
- However, understanding how a tokenizer works (BPE merge rules) is still important.
|
| 22 |
+
|
| 23 |
+
BPE (Byte Pair Encoding) core principle:
|
| 24 |
+
1) Split text into byte/character units
|
| 25 |
+
2) Repeatedly merge the most frequent adjacent pair
|
| 26 |
+
3) Repeat until vocab_size is reached
|
| 27 |
+
→ Frequent words become a single token; rare words are split into multiple tokens
|
| 28 |
"""
|
| 29 |
|
| 30 |
def __init__(self, config: DataConfig):
|
|
|
|
| 32 |
self._tokenizer = None
|
| 33 |
self.vocab_size = config.vocab_size
|
| 34 |
|
| 35 |
+
# Special token IDs (set after initialization)
|
| 36 |
self.bos_id: int = 1 # Beginning of Sequence
|
| 37 |
self.eos_id: int = 2 # End of Sequence
|
| 38 |
self.pad_id: int = 0 # Padding
|
| 39 |
|
| 40 |
# ────────────────────────────────────────────────
|
| 41 |
+
# Method 1: Load a SentencePiece model
|
| 42 |
# ────────────────────────────────────────────────
|
| 43 |
|
| 44 |
def load_sentencepiece(self, model_path: str):
|
| 45 |
+
"""Loads an existing SentencePiece model."""
|
| 46 |
import sentencepiece as spm
|
| 47 |
|
| 48 |
self._tokenizer = spm.SentencePieceProcessor()
|
|
|
|
| 55 |
self._encode_fn = self._tokenizer.Encode
|
| 56 |
self._decode_fn = self._tokenizer.Decode
|
| 57 |
|
| 58 |
+
print(f"[Tokenizer] SentencePiece loaded: vocab_size={self.vocab_size}")
|
| 59 |
|
| 60 |
# ────────────────────────────────────────────────
|
| 61 |
+
# Method 2: Train a BPE tokenizer with HuggingFace tokenizers
|
| 62 |
# ────────────────────────────────────────────────
|
| 63 |
|
| 64 |
def train_bpe(self, text_iterator: Iterator[str], save_dir: Optional[str] = None):
|
| 65 |
+
"""Trains a BPE tokenizer from scratch.
|
| 66 |
|
| 67 |
Args:
|
| 68 |
+
text_iterator: Iterator that yields training text strings
|
| 69 |
+
save_dir: Directory path to save the trained tokenizer
|
| 70 |
|
| 71 |
+
Key insights:
|
| 72 |
+
- Larger vocab_size: common expressions become 1 token → shorter sequences
|
| 73 |
+
- Smaller vocab_size: saves embedding parameters, but sequences get longer
|
| 74 |
+
- 32K is a good balance point for English
|
| 75 |
"""
|
| 76 |
from tokenizers import Tokenizer as HFTokenizer
|
| 77 |
from tokenizers.models import BPE
|
|
|
|
| 79 |
from tokenizers.pre_tokenizers import ByteLevel
|
| 80 |
from tokenizers.processors import TemplateProcessing
|
| 81 |
|
| 82 |
+
print("[Tokenizer] Starting BPE tokenizer training...")
|
| 83 |
|
| 84 |
+
# Create BPE model
|
| 85 |
tokenizer = HFTokenizer(BPE(unk_token="<unk>"))
|
| 86 |
tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=False)
|
| 87 |
|
| 88 |
+
# Define special tokens
|
| 89 |
special_tokens = ["<pad>", "<s>", "</s>", "<unk>"]
|
| 90 |
|
| 91 |
+
# Configure trainer
|
| 92 |
trainer = BpeTrainer(
|
| 93 |
vocab_size=self.config.vocab_size,
|
| 94 |
special_tokens=special_tokens,
|
| 95 |
+
min_frequency=2, # Only merge pairs that appear at least twice
|
| 96 |
show_progress=True,
|
| 97 |
)
|
| 98 |
|
| 99 |
+
# Run training
|
| 100 |
tokenizer.train_from_iterator(text_iterator, trainer=trainer)
|
| 101 |
|
| 102 |
+
# Post-processing: automatically add BOS/EOS
|
| 103 |
tokenizer.post_processor = TemplateProcessing(
|
| 104 |
single="<s> $A </s>",
|
| 105 |
special_tokens=[("<s>", 1), ("</s>", 2)],
|
|
|
|
| 114 |
self._encode_fn = lambda text: tokenizer.encode(text).ids
|
| 115 |
self._decode_fn = lambda ids: tokenizer.decode(ids)
|
| 116 |
|
| 117 |
+
# Save
|
| 118 |
save_dir = save_dir or self.config.tokenizer_save_dir
|
| 119 |
os.makedirs(save_dir, exist_ok=True)
|
| 120 |
tokenizer.save(os.path.join(save_dir, "tokenizer.json"))
|
| 121 |
+
# Save metadata
|
| 122 |
meta = {
|
| 123 |
"vocab_size": self.vocab_size,
|
| 124 |
"bos_id": self.bos_id,
|
|
|
|
| 128 |
with open(os.path.join(save_dir, "tokenizer_meta.json"), "w") as f:
|
| 129 |
json.dump(meta, f, indent=2)
|
| 130 |
|
| 131 |
+
print(f"[Tokenizer] Training complete: vocab_size={self.vocab_size}")
|
| 132 |
+
print(f"[Tokenizer] Saved to: {save_dir}")
|
| 133 |
|
| 134 |
# ────────────────────────────────────────────────
|
| 135 |
+
# Method 3: Load a pretrained HF tokenizer
|
| 136 |
# ────────────────────────────────────────────────
|
| 137 |
|
| 138 |
def load_pretrained_hf(self, name_or_path: str = "meta-llama/Llama-2-7b-hf"):
|
| 139 |
+
"""Loads a pretrained tokenizer from HuggingFace.
|
| 140 |
|
| 141 |
+
The simplest method. The LLaMA tokenizer has a 32K vocab and is BPE-based.
|
| 142 |
+
Note: meta-llama models may require HF approval to access.
|
| 143 |
+
Alternative: mistralai/Mistral-7B-v0.1 (no approval required)
|
| 144 |
"""
|
| 145 |
from transformers import AutoTokenizer
|
| 146 |
|
| 147 |
+
print(f"[Tokenizer] Loading HF tokenizer: {name_or_path}")
|
| 148 |
tokenizer = AutoTokenizer.from_pretrained(name_or_path)
|
| 149 |
|
| 150 |
self._tokenizer = tokenizer
|
|
|
|
| 156 |
self._encode_fn = lambda text: tokenizer.encode(text, add_special_tokens=False)
|
| 157 |
self._decode_fn = lambda ids: tokenizer.decode(ids)
|
| 158 |
|
| 159 |
+
print(f"[Tokenizer] Loaded: vocab_size={self.vocab_size}")
|
| 160 |
|
| 161 |
def load_trained_hf(self, path: str):
|
| 162 |
+
"""Reloads a tokenizer previously trained with train_bpe()."""
|
| 163 |
from tokenizers import Tokenizer as HFTokenizer
|
| 164 |
|
| 165 |
tokenizer = HFTokenizer.from_file(os.path.join(path, "tokenizer.json"))
|
|
|
|
| 175 |
self._encode_fn = lambda text: tokenizer.encode(text).ids
|
| 176 |
self._decode_fn = lambda ids: tokenizer.decode(ids)
|
| 177 |
|
| 178 |
+
print(f"[Tokenizer] Loaded: vocab_size={self.vocab_size}")
|
| 179 |
|
| 180 |
# ────────────────────────────────────────────────
|
| 181 |
+
# Common interface
|
| 182 |
# ────────────────────────────────────────────────
|
| 183 |
|
| 184 |
def encode(self, text: str, add_special_tokens: bool = False) -> List[int]:
|
| 185 |
+
"""Text → list of token IDs."""
|
| 186 |
ids = self._encode_fn(text)
|
| 187 |
if add_special_tokens:
|
| 188 |
ids = [self.bos_id] + ids + [self.eos_id]
|
| 189 |
return ids
|
| 190 |
|
| 191 |
def decode(self, ids: List[int]) -> str:
|
| 192 |
+
"""List of token IDs → text."""
|
| 193 |
return self._decode_fn(ids)
|
| 194 |
|
| 195 |
def __len__(self) -> int:
|
llm_lab/evaluation/__init__.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
"""
|
| 2 |
|
| 3 |
from .perplexity import PerplexityEvaluator
|
| 4 |
from .generation import GenerationEvaluator
|
|
|
|
| 1 |
+
"""Evaluation module — Perplexity, text generation, Scaling Law, Attention visualization."""
|
| 2 |
|
| 3 |
from .perplexity import PerplexityEvaluator
|
| 4 |
from .generation import GenerationEvaluator
|
llm_lab/evaluation/attention_viz.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
"""Attention
|
| 2 |
|
| 3 |
import math
|
| 4 |
from pathlib import Path
|
|
@@ -18,15 +18,15 @@ except ImportError:
|
|
| 18 |
|
| 19 |
|
| 20 |
class AttentionVisualizer:
|
| 21 |
-
"""
|
| 22 |
|
| 23 |
-
|
| 24 |
-
- Causal Mask:
|
| 25 |
-
-
|
| 26 |
-
-
|
| 27 |
|
| 28 |
-
|
| 29 |
-
→
|
| 30 |
"""
|
| 31 |
|
| 32 |
def __init__(self, save_dir: str = "./eval_results"):
|
|
@@ -41,10 +41,10 @@ class AttentionVisualizer:
|
|
| 41 |
layer_idx: int = 0,
|
| 42 |
device: torch.device = torch.device("cpu"),
|
| 43 |
) -> torch.Tensor:
|
| 44 |
-
"""
|
| 45 |
|
| 46 |
-
|
| 47 |
-
attention
|
| 48 |
|
| 49 |
Returns:
|
| 50 |
attention_weights: (num_heads, seq_len, seq_len)
|
|
@@ -52,10 +52,10 @@ class AttentionVisualizer:
|
|
| 52 |
model.eval()
|
| 53 |
captured_attn = {}
|
| 54 |
|
| 55 |
-
#
|
| 56 |
target_layer = model.layers[layer_idx].attention
|
| 57 |
|
| 58 |
-
# scaled_dot_product_attention
|
| 59 |
original_forward = target_layer.forward
|
| 60 |
|
| 61 |
def hooked_forward(x, mask=None, position_offset=0):
|
|
@@ -72,7 +72,7 @@ class AttentionVisualizer:
|
|
| 72 |
k = target_layer._repeat_kv(k)
|
| 73 |
v = target_layer._repeat_kv(v)
|
| 74 |
|
| 75 |
-
#
|
| 76 |
scale = 1.0 / math.sqrt(hd)
|
| 77 |
scores = torch.matmul(q, k.transpose(-2, -1)) * scale
|
| 78 |
|
|
@@ -81,13 +81,13 @@ class AttentionVisualizer:
|
|
| 81 |
scores.masked_fill_(causal.unsqueeze(0).unsqueeze(0), float("-inf"))
|
| 82 |
|
| 83 |
attn_weights = F.softmax(scores, dim=-1)
|
| 84 |
-
captured_attn["weights"] = attn_weights[0].cpu() #
|
| 85 |
|
| 86 |
out = torch.matmul(attn_weights, v)
|
| 87 |
out = out.transpose(1, 2).contiguous().view(B, S, -1)
|
| 88 |
return target_layer.o_proj(out)
|
| 89 |
|
| 90 |
-
#
|
| 91 |
target_layer.forward = hooked_forward
|
| 92 |
|
| 93 |
try:
|
|
@@ -105,13 +105,13 @@ class AttentionVisualizer:
|
|
| 105 |
save_path: Optional[str] = None,
|
| 106 |
title: str = "Attention Weights",
|
| 107 |
):
|
| 108 |
-
"""
|
| 109 |
if not HAS_MATPLOTLIB:
|
| 110 |
-
print("⚠️ matplotlib
|
| 111 |
return
|
| 112 |
|
| 113 |
weights = attn_weights[head_idx].numpy()
|
| 114 |
-
max_len = min(len(tokens), 50) #
|
| 115 |
weights = weights[:max_len, :max_len]
|
| 116 |
display_tokens = tokens[:max_len]
|
| 117 |
|
|
@@ -132,7 +132,7 @@ class AttentionVisualizer:
|
|
| 132 |
|
| 133 |
save_path = save_path or str(self.save_dir / f"attention_head{head_idx}.png")
|
| 134 |
fig.savefig(save_path, dpi=150, bbox_inches="tight")
|
| 135 |
-
print(f" 📊 Attention
|
| 136 |
plt.close(fig)
|
| 137 |
|
| 138 |
def plot_multi_head_summary(
|
|
@@ -141,7 +141,7 @@ class AttentionVisualizer:
|
|
| 141 |
num_heads_to_show: int = 8,
|
| 142 |
save_path: Optional[str] = None,
|
| 143 |
):
|
| 144 |
-
"""
|
| 145 |
if not HAS_MATPLOTLIB:
|
| 146 |
return
|
| 147 |
|
|
@@ -162,7 +162,7 @@ class AttentionVisualizer:
|
|
| 162 |
ax.set_xticks([])
|
| 163 |
ax.set_yticks([])
|
| 164 |
|
| 165 |
-
#
|
| 166 |
for idx in range(n_heads, rows * cols):
|
| 167 |
r, c = idx // cols, idx % cols
|
| 168 |
axes[r, c].axis("off")
|
|
@@ -172,5 +172,5 @@ class AttentionVisualizer:
|
|
| 172 |
|
| 173 |
save_path = save_path or str(self.save_dir / "attention_multi_head.png")
|
| 174 |
fig.savefig(save_path, dpi=150, bbox_inches="tight")
|
| 175 |
-
print(f" 📊
|
| 176 |
plt.close(fig)
|
|
|
|
| 1 |
+
"""Attention pattern visualization."""
|
| 2 |
|
| 3 |
import math
|
| 4 |
from pathlib import Path
|
|
|
|
| 18 |
|
| 19 |
|
| 20 |
class AttentionVisualizer:
|
| 21 |
+
"""Visualizes attention patterns.
|
| 22 |
|
| 23 |
+
Learning insights:
|
| 24 |
+
- Causal Mask: lower-triangular pattern (future tokens cannot be attended to)
|
| 25 |
+
- Head specialization: some heads focus locally (adjacent), others globally (distant tokens)
|
| 26 |
+
- Syntactic patterns: high attention on verb→subject, pronoun→antecedent, etc.
|
| 27 |
|
| 28 |
+
Note: Storing the full attention of a 1B model causes out-of-memory!
|
| 29 |
+
→ Visualize only selected layers/heads.
|
| 30 |
"""
|
| 31 |
|
| 32 |
def __init__(self, save_dir: str = "./eval_results"):
|
|
|
|
| 41 |
layer_idx: int = 0,
|
| 42 |
device: torch.device = torch.device("cpu"),
|
| 43 |
) -> torch.Tensor:
|
| 44 |
+
"""Extracts attention weights from a specific layer.
|
| 45 |
|
| 46 |
+
Temporarily modifies the model's attention module to
|
| 47 |
+
capture attention weights.
|
| 48 |
|
| 49 |
Returns:
|
| 50 |
attention_weights: (num_heads, seq_len, seq_len)
|
|
|
|
| 52 |
model.eval()
|
| 53 |
captured_attn = {}
|
| 54 |
|
| 55 |
+
# Capture attention weights via hook
|
| 56 |
target_layer = model.layers[layer_idx].attention
|
| 57 |
|
| 58 |
+
# Replace scaled_dot_product_attention with a manual implementation
|
| 59 |
original_forward = target_layer.forward
|
| 60 |
|
| 61 |
def hooked_forward(x, mask=None, position_offset=0):
|
|
|
|
| 72 |
k = target_layer._repeat_kv(k)
|
| 73 |
v = target_layer._repeat_kv(v)
|
| 74 |
|
| 75 |
+
# Manual attention computation (for weight extraction)
|
| 76 |
scale = 1.0 / math.sqrt(hd)
|
| 77 |
scores = torch.matmul(q, k.transpose(-2, -1)) * scale
|
| 78 |
|
|
|
|
| 81 |
scores.masked_fill_(causal.unsqueeze(0).unsqueeze(0), float("-inf"))
|
| 82 |
|
| 83 |
attn_weights = F.softmax(scores, dim=-1)
|
| 84 |
+
captured_attn["weights"] = attn_weights[0].cpu() # first batch only
|
| 85 |
|
| 86 |
out = torch.matmul(attn_weights, v)
|
| 87 |
out = out.transpose(1, 2).contiguous().view(B, S, -1)
|
| 88 |
return target_layer.o_proj(out)
|
| 89 |
|
| 90 |
+
# Apply hook
|
| 91 |
target_layer.forward = hooked_forward
|
| 92 |
|
| 93 |
try:
|
|
|
|
| 105 |
save_path: Optional[str] = None,
|
| 106 |
title: str = "Attention Weights",
|
| 107 |
):
|
| 108 |
+
"""Draws an attention heatmap."""
|
| 109 |
if not HAS_MATPLOTLIB:
|
| 110 |
+
print("⚠️ matplotlib required")
|
| 111 |
return
|
| 112 |
|
| 113 |
weights = attn_weights[head_idx].numpy()
|
| 114 |
+
max_len = min(len(tokens), 50) # display at most 50 tokens
|
| 115 |
weights = weights[:max_len, :max_len]
|
| 116 |
display_tokens = tokens[:max_len]
|
| 117 |
|
|
|
|
| 132 |
|
| 133 |
save_path = save_path or str(self.save_dir / f"attention_head{head_idx}.png")
|
| 134 |
fig.savefig(save_path, dpi=150, bbox_inches="tight")
|
| 135 |
+
print(f" 📊 Attention visualization saved: {save_path}")
|
| 136 |
plt.close(fig)
|
| 137 |
|
| 138 |
def plot_multi_head_summary(
|
|
|
|
| 141 |
num_heads_to_show: int = 8,
|
| 142 |
save_path: Optional[str] = None,
|
| 143 |
):
|
| 144 |
+
"""Summarizes and compares attention patterns across multiple heads."""
|
| 145 |
if not HAS_MATPLOTLIB:
|
| 146 |
return
|
| 147 |
|
|
|
|
| 162 |
ax.set_xticks([])
|
| 163 |
ax.set_yticks([])
|
| 164 |
|
| 165 |
+
# Hide empty subplots
|
| 166 |
for idx in range(n_heads, rows * cols):
|
| 167 |
r, c = idx // cols, idx % cols
|
| 168 |
axes[r, c].axis("off")
|
|
|
|
| 172 |
|
| 173 |
save_path = save_path or str(self.save_dir / "attention_multi_head.png")
|
| 174 |
fig.savefig(save_path, dpi=150, bbox_inches="tight")
|
| 175 |
+
print(f" 📊 Multi-head summary saved: {save_path}")
|
| 176 |
plt.close(fig)
|
llm_lab/evaluation/checklist.py
CHANGED
|
@@ -1,13 +1,13 @@
|
|
| 1 |
-
"""
|
| 2 |
|
| 3 |
from typing import Any, Dict, Optional
|
| 4 |
|
| 5 |
|
| 6 |
class InsightChecklist:
|
| 7 |
-
"""
|
| 8 |
|
| 9 |
-
|
| 10 |
-
|
| 11 |
"""
|
| 12 |
|
| 13 |
@staticmethod
|
|
@@ -15,9 +15,9 @@ class InsightChecklist:
|
|
| 15 |
report: Dict[str, Any],
|
| 16 |
metrics_history: Optional[Dict[str, list]] = None,
|
| 17 |
):
|
| 18 |
-
"""
|
| 19 |
print("\n" + "=" * 70)
|
| 20 |
-
print("✅
|
| 21 |
print("=" * 70)
|
| 22 |
|
| 23 |
checks = {
|
|
@@ -26,74 +26,74 @@ class InsightChecklist:
|
|
| 26 |
"manual": [],
|
| 27 |
}
|
| 28 |
|
| 29 |
-
# ──
|
| 30 |
|
| 31 |
-
# 1. Loss
|
| 32 |
if report.get("perplexity", {}).get("loss", 99) < 4.0:
|
| 33 |
-
checks["passed"].append("
|
| 34 |
else:
|
| 35 |
-
checks["failed"].append("
|
| 36 |
|
| 37 |
-
# 2. Loss
|
| 38 |
spikes = report.get("training_dynamics", {}).get("loss", {}).get("spikes", [])
|
| 39 |
if len(spikes) < 5:
|
| 40 |
-
checks["passed"].append(f"Loss
|
| 41 |
else:
|
| 42 |
-
checks["failed"].append(f"Loss
|
| 43 |
|
| 44 |
-
# 3.
|
| 45 |
if report.get("position_losses"):
|
| 46 |
early = report["position_losses"]["early_avg"]
|
| 47 |
late = report["position_losses"]["late_avg"]
|
| 48 |
if early > late:
|
| 49 |
-
checks["passed"].append("
|
| 50 |
else:
|
| 51 |
-
checks["failed"].append("
|
| 52 |
|
| 53 |
-
# 4.
|
| 54 |
rep = report.get("generation", {}).get("avg_metrics", {}).get("repetition_rate", 1.0)
|
| 55 |
if rep < 0.3:
|
| 56 |
-
checks["passed"].append(f"
|
| 57 |
else:
|
| 58 |
-
checks["failed"].append(f"
|
| 59 |
|
| 60 |
-
# 5. Gradient
|
| 61 |
if metrics_history and metrics_history.get("grad_norm"):
|
| 62 |
gnorms = metrics_history["grad_norm"]
|
| 63 |
clip_rate = sum(1 for g in gnorms if g >= 0.99) / max(len(gnorms), 1)
|
| 64 |
if clip_rate < 0.3:
|
| 65 |
-
checks["passed"].append(f"Gradient
|
| 66 |
else:
|
| 67 |
-
checks["failed"].append(f"Gradient
|
| 68 |
|
| 69 |
-
# ──
|
| 70 |
manual_items = [
|
| 71 |
-
"
|
| 72 |
-
"
|
| 73 |
-
"
|
| 74 |
-
"SwiGLU
|
| 75 |
-
"Learning Rate Warmup
|
| 76 |
-
"Gradient Accumulation
|
| 77 |
-
"Mixed Precision(bf16)
|
| 78 |
-
"
|
| 79 |
]
|
| 80 |
checks["manual"] = manual_items
|
| 81 |
|
| 82 |
-
# ──
|
| 83 |
total_auto = len(checks["passed"]) + len(checks["failed"])
|
| 84 |
passed_auto = len(checks["passed"])
|
| 85 |
|
| 86 |
-
print(f"\n
|
| 87 |
for item in checks["passed"]:
|
| 88 |
print(f" ✅ {item}")
|
| 89 |
for item in checks["failed"]:
|
| 90 |
print(f" ❌ {item}")
|
| 91 |
|
| 92 |
-
print(f"\n
|
| 93 |
for i, item in enumerate(manual_items, 1):
|
| 94 |
print(f" {i}. [ ] {item}")
|
| 95 |
|
| 96 |
-
print(f"\n
|
| 97 |
-
f"(
|
| 98 |
|
| 99 |
return checks
|
|
|
|
| 1 |
+
"""Training insight checklist validator."""
|
| 2 |
|
| 3 |
from typing import Any, Dict, Optional
|
| 4 |
|
| 5 |
|
| 6 |
class InsightChecklist:
|
| 7 |
+
"""Automatically and manually validates the training insight checklist defined in the PRD.
|
| 8 |
|
| 9 |
+
Items that can be automatically validated are judged based on metrics,
|
| 10 |
+
while manual items are presented as questions.
|
| 11 |
"""
|
| 12 |
|
| 13 |
@staticmethod
|
|
|
|
| 15 |
report: Dict[str, Any],
|
| 16 |
metrics_history: Optional[Dict[str, list]] = None,
|
| 17 |
):
|
| 18 |
+
"""Runs the checklist."""
|
| 19 |
print("\n" + "=" * 70)
|
| 20 |
+
print("✅ Training Insight Checklist")
|
| 21 |
print("=" * 70)
|
| 22 |
|
| 23 |
checks = {
|
|
|
|
| 26 |
"manual": [],
|
| 27 |
}
|
| 28 |
|
| 29 |
+
# ── Automatic validation ──
|
| 30 |
|
| 31 |
+
# 1. Loss convergence
|
| 32 |
if report.get("perplexity", {}).get("loss", 99) < 4.0:
|
| 33 |
+
checks["passed"].append("Model Loss converged below 4.0")
|
| 34 |
else:
|
| 35 |
+
checks["failed"].append("Model Loss has not converged below 4.0")
|
| 36 |
|
| 37 |
+
# 2. Loss spikes
|
| 38 |
spikes = report.get("training_dynamics", {}).get("loss", {}).get("spikes", [])
|
| 39 |
if len(spikes) < 5:
|
| 40 |
+
checks["passed"].append(f"Loss spikes: {len(spikes)} (< 5)")
|
| 41 |
else:
|
| 42 |
+
checks["failed"].append(f"Loss spikes: {len(spikes)} (>= 5, stability improvement needed)")
|
| 43 |
|
| 44 |
+
# 3. Per-position loss pattern
|
| 45 |
if report.get("position_losses"):
|
| 46 |
early = report["position_losses"]["early_avg"]
|
| 47 |
late = report["position_losses"]["late_avg"]
|
| 48 |
if early > late:
|
| 49 |
+
checks["passed"].append("Per-position loss decrease pattern confirmed (context utilization)")
|
| 50 |
else:
|
| 51 |
+
checks["failed"].append("Per-position loss pattern abnormal (context not utilized?)")
|
| 52 |
|
| 53 |
+
# 4. Generation repetition rate
|
| 54 |
rep = report.get("generation", {}).get("avg_metrics", {}).get("repetition_rate", 1.0)
|
| 55 |
if rep < 0.3:
|
| 56 |
+
checks["passed"].append(f"Generation repetition rate {rep:.1%} (< 30%)")
|
| 57 |
else:
|
| 58 |
+
checks["failed"].append(f"Generation repetition rate {rep:.1%} (>= 30%, adjust temperature/top_p)")
|
| 59 |
|
| 60 |
+
# 5. Gradient clipping rate
|
| 61 |
if metrics_history and metrics_history.get("grad_norm"):
|
| 62 |
gnorms = metrics_history["grad_norm"]
|
| 63 |
clip_rate = sum(1 for g in gnorms if g >= 0.99) / max(len(gnorms), 1)
|
| 64 |
if clip_rate < 0.3:
|
| 65 |
+
checks["passed"].append(f"Gradient clipping rate {clip_rate:.1%} (healthy)")
|
| 66 |
else:
|
| 67 |
+
checks["failed"].append(f"Gradient clipping rate {clip_rate:.1%} (too frequent)")
|
| 68 |
|
| 69 |
+
# ── Manual verification items ──
|
| 70 |
manual_items = [
|
| 71 |
+
"Can you explain the individual roles of Q, K, and V in Self-Attention?",
|
| 72 |
+
"Do you understand the mathematical principle by which RoPE encodes positional information?",
|
| 73 |
+
"Can you explain the mechanism by which GQA saves memory compared to MHA?",
|
| 74 |
+
"Do you understand how SwiGLU's gating mechanism differs from a ReLU FFN?",
|
| 75 |
+
"Did you experience why Learning Rate Warmup is necessary?",
|
| 76 |
+
"Do you understand the principle by which Gradient Accumulation simulates a large batch?",
|
| 77 |
+
"Have you measured the memory-speed effect of Mixed Precision (bf16)?",
|
| 78 |
+
"Do you understand the memory-compute trade-off of Activation Checkpointing?",
|
| 79 |
]
|
| 80 |
checks["manual"] = manual_items
|
| 81 |
|
| 82 |
+
# ── Output ──
|
| 83 |
total_auto = len(checks["passed"]) + len(checks["failed"])
|
| 84 |
passed_auto = len(checks["passed"])
|
| 85 |
|
| 86 |
+
print(f"\n Automatic validation: {passed_auto}/{total_auto} passed")
|
| 87 |
for item in checks["passed"]:
|
| 88 |
print(f" ✅ {item}")
|
| 89 |
for item in checks["failed"]:
|
| 90 |
print(f" ❌ {item}")
|
| 91 |
|
| 92 |
+
print(f"\n Manual verification ({len(manual_items)} items):")
|
| 93 |
for i, item in enumerate(manual_items, 1):
|
| 94 |
print(f" {i}. [ ] {item}")
|
| 95 |
|
| 96 |
+
print(f"\n Total progress: {passed_auto}/{total_auto + len(manual_items)} "
|
| 97 |
+
f"(including manual items)")
|
| 98 |
|
| 99 |
return checks
|
llm_lab/evaluation/dynamics.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
"""
|
| 2 |
|
| 3 |
import math
|
| 4 |
from pathlib import Path
|
|
@@ -14,13 +14,13 @@ except ImportError:
|
|
| 14 |
|
| 15 |
|
| 16 |
class TrainingDynamicsAnalyzer:
|
| 17 |
-
"""
|
| 18 |
|
| 19 |
-
|
| 20 |
-
- Loss
|
| 21 |
-
- LR
|
| 22 |
-
- Gradient Norm:
|
| 23 |
-
-
|
| 24 |
"""
|
| 25 |
|
| 26 |
def __init__(self, save_dir: str = "./eval_results"):
|
|
@@ -28,21 +28,21 @@ class TrainingDynamicsAnalyzer:
|
|
| 28 |
self.save_dir.mkdir(parents=True, exist_ok=True)
|
| 29 |
|
| 30 |
def analyze_metrics(self, metrics_history: Dict[str, list]) -> Dict[str, Any]:
|
| 31 |
-
"""
|
| 32 |
|
| 33 |
Args:
|
| 34 |
-
metrics_history: Trainer.metrics.history
|
| 35 |
|
| 36 |
Returns:
|
| 37 |
-
|
| 38 |
"""
|
| 39 |
print("\n" + "=" * 70)
|
| 40 |
-
print("🔬
|
| 41 |
print("=" * 70)
|
| 42 |
|
| 43 |
analysis = {}
|
| 44 |
|
| 45 |
-
# ── Loss
|
| 46 |
if metrics_history.get("train_loss"):
|
| 47 |
losses = metrics_history["train_loss"]
|
| 48 |
analysis["loss"] = {
|
|
@@ -52,7 +52,7 @@ class TrainingDynamicsAnalyzer:
|
|
| 52 |
"total_reduction": round(losses[0] - losses[-1], 4),
|
| 53 |
}
|
| 54 |
|
| 55 |
-
#
|
| 56 |
spikes = []
|
| 57 |
for i in range(1, len(losses)):
|
| 58 |
if losses[i] > losses[i-1] * 1.5:
|
|
@@ -61,17 +61,17 @@ class TrainingDynamicsAnalyzer:
|
|
| 61 |
|
| 62 |
analysis["loss"]["spikes"] = spikes
|
| 63 |
|
| 64 |
-
print(f"\n 📉 Loss
|
| 65 |
-
print(f"
|
| 66 |
-
print(f"
|
| 67 |
-
print(f"
|
| 68 |
-
print(f"
|
| 69 |
-
print(f"
|
| 70 |
if spikes:
|
| 71 |
for s in spikes[:5]:
|
| 72 |
print(f" Step {s['step']}: Loss = {s['loss']}")
|
| 73 |
|
| 74 |
-
# ── Gradient Norm
|
| 75 |
if metrics_history.get("grad_norm"):
|
| 76 |
gnorms = metrics_history["grad_norm"]
|
| 77 |
analysis["grad_norm"] = {
|
|
@@ -81,14 +81,14 @@ class TrainingDynamicsAnalyzer:
|
|
| 81 |
"clipped_pct": round(sum(1 for g in gnorms if g >= 0.99) / len(gnorms) * 100, 1),
|
| 82 |
}
|
| 83 |
|
| 84 |
-
print(f"\n 📐 Gradient Norm
|
| 85 |
-
print(f"
|
| 86 |
-
print(f"
|
| 87 |
-
print(f"
|
| 88 |
if analysis["grad_norm"]["clipped_pct"] > 30:
|
| 89 |
-
print(f" ⚠️
|
| 90 |
|
| 91 |
-
# ──
|
| 92 |
if metrics_history.get("tokens_per_sec"):
|
| 93 |
tps = metrics_history["tokens_per_sec"]
|
| 94 |
tps_valid = [t for t in tps if t > 0]
|
|
@@ -100,10 +100,10 @@ class TrainingDynamicsAnalyzer:
|
|
| 100 |
"max": round(max(tps_valid)),
|
| 101 |
}
|
| 102 |
|
| 103 |
-
print(f"\n ⚡
|
| 104 |
-
print(f"
|
| 105 |
-
print(f"
|
| 106 |
-
print(f"
|
| 107 |
|
| 108 |
return analysis
|
| 109 |
|
|
@@ -112,9 +112,9 @@ class TrainingDynamicsAnalyzer:
|
|
| 112 |
metrics_history: Dict[str, list],
|
| 113 |
save_path: Optional[str] = None,
|
| 114 |
):
|
| 115 |
-
"""
|
| 116 |
if not HAS_MATPLOTLIB:
|
| 117 |
-
print("⚠️ matplotlib
|
| 118 |
return
|
| 119 |
|
| 120 |
fig, axes = plt.subplots(2, 2, figsize=(16, 10))
|
|
@@ -129,7 +129,7 @@ class TrainingDynamicsAnalyzer:
|
|
| 129 |
metrics_history["train_loss"],
|
| 130 |
color="#2563eb", alpha=0.6, linewidth=0.8, label="Train Loss")
|
| 131 |
|
| 132 |
-
#
|
| 133 |
if len(metrics_history["train_loss"]) > 20:
|
| 134 |
window = min(50, len(metrics_history["train_loss"]) // 5)
|
| 135 |
smoothed = self._moving_average(metrics_history["train_loss"], window)
|
|
@@ -192,7 +192,7 @@ class TrainingDynamicsAnalyzer:
|
|
| 192 |
|
| 193 |
save_path = save_path or str(self.save_dir / "training_curves.png")
|
| 194 |
fig.savefig(save_path, dpi=150, bbox_inches="tight")
|
| 195 |
-
print(f"\n 📊
|
| 196 |
plt.close(fig)
|
| 197 |
|
| 198 |
def plot_position_loss(
|
|
@@ -200,7 +200,7 @@ class TrainingDynamicsAnalyzer:
|
|
| 200 |
position_losses: List[float],
|
| 201 |
save_path: Optional[str] = None,
|
| 202 |
):
|
| 203 |
-
"""
|
| 204 |
if not HAS_MATPLOTLIB:
|
| 205 |
return
|
| 206 |
|
|
@@ -215,7 +215,7 @@ class TrainingDynamicsAnalyzer:
|
|
| 215 |
ax.set_title("Loss by Position (earlier positions have less context)", fontsize=13, fontweight="bold")
|
| 216 |
ax.grid(True, alpha=0.3)
|
| 217 |
|
| 218 |
-
#
|
| 219 |
if len(position_losses) > 100:
|
| 220 |
early_avg = sum(position_losses[:50]) / 50
|
| 221 |
late_avg = sum(position_losses[-200:]) / 200
|
|
@@ -229,12 +229,12 @@ class TrainingDynamicsAnalyzer:
|
|
| 229 |
|
| 230 |
save_path = save_path or str(self.save_dir / "position_loss.png")
|
| 231 |
fig.savefig(save_path, dpi=150, bbox_inches="tight")
|
| 232 |
-
print(f" 📊
|
| 233 |
plt.close(fig)
|
| 234 |
|
| 235 |
@staticmethod
|
| 236 |
def _moving_average(data: list, window: int) -> list:
|
| 237 |
-
"""
|
| 238 |
result = []
|
| 239 |
for i in range(window - 1, len(data)):
|
| 240 |
avg = sum(data[i - window + 1 : i + 1]) / window
|
|
|
|
| 1 |
+
"""Training dynamics analyzer."""
|
| 2 |
|
| 3 |
import math
|
| 4 |
from pathlib import Path
|
|
|
|
| 14 |
|
| 15 |
|
| 16 |
class TrainingDynamicsAnalyzer:
|
| 17 |
+
"""Analyzes and visualizes training metrics.
|
| 18 |
|
| 19 |
+
Analysis items:
|
| 20 |
+
- Loss curve: Convergence patterns, spike detection
|
| 21 |
+
- LR schedule: Warmup + Cosine decay verification
|
| 22 |
+
- Gradient Norm: Training stability, explosion/vanishing detection
|
| 23 |
+
- Throughput: tokens/sec stability, bottleneck detection
|
| 24 |
"""
|
| 25 |
|
| 26 |
def __init__(self, save_dir: str = "./eval_results"):
|
|
|
|
| 28 |
self.save_dir.mkdir(parents=True, exist_ok=True)
|
| 29 |
|
| 30 |
def analyze_metrics(self, metrics_history: Dict[str, list]) -> Dict[str, Any]:
|
| 31 |
+
"""Analyzes training metrics.
|
| 32 |
|
| 33 |
Args:
|
| 34 |
+
metrics_history: Trainer.metrics.history dictionary
|
| 35 |
|
| 36 |
Returns:
|
| 37 |
+
Analysis results
|
| 38 |
"""
|
| 39 |
print("\n" + "=" * 70)
|
| 40 |
+
print("🔬 Training Dynamics Analysis")
|
| 41 |
print("=" * 70)
|
| 42 |
|
| 43 |
analysis = {}
|
| 44 |
|
| 45 |
+
# ── Loss analysis ──
|
| 46 |
if metrics_history.get("train_loss"):
|
| 47 |
losses = metrics_history["train_loss"]
|
| 48 |
analysis["loss"] = {
|
|
|
|
| 52 |
"total_reduction": round(losses[0] - losses[-1], 4),
|
| 53 |
}
|
| 54 |
|
| 55 |
+
# Spike detection (sudden increase of 50% or more compared to previous value)
|
| 56 |
spikes = []
|
| 57 |
for i in range(1, len(losses)):
|
| 58 |
if losses[i] > losses[i-1] * 1.5:
|
|
|
|
| 61 |
|
| 62 |
analysis["loss"]["spikes"] = spikes
|
| 63 |
|
| 64 |
+
print(f"\n 📉 Loss Analysis:")
|
| 65 |
+
print(f" Initial: {analysis['loss']['initial']:.4f}")
|
| 66 |
+
print(f" Final: {analysis['loss']['final']:.4f}")
|
| 67 |
+
print(f" Minimum: {analysis['loss']['minimum']:.4f}")
|
| 68 |
+
print(f" Reduction: {analysis['loss']['total_reduction']:.4f}")
|
| 69 |
+
print(f" Spikes: {len(spikes)}")
|
| 70 |
if spikes:
|
| 71 |
for s in spikes[:5]:
|
| 72 |
print(f" Step {s['step']}: Loss = {s['loss']}")
|
| 73 |
|
| 74 |
+
# ── Gradient Norm analysis ──
|
| 75 |
if metrics_history.get("grad_norm"):
|
| 76 |
gnorms = metrics_history["grad_norm"]
|
| 77 |
analysis["grad_norm"] = {
|
|
|
|
| 81 |
"clipped_pct": round(sum(1 for g in gnorms if g >= 0.99) / len(gnorms) * 100, 1),
|
| 82 |
}
|
| 83 |
|
| 84 |
+
print(f"\n 📐 Gradient Norm Analysis:")
|
| 85 |
+
print(f" Mean: {analysis['grad_norm']['mean']:.4f}")
|
| 86 |
+
print(f" Max: {analysis['grad_norm']['max']:.4f}")
|
| 87 |
+
print(f" Clipping rate: {analysis['grad_norm']['clipped_pct']:.1f}%")
|
| 88 |
if analysis["grad_norm"]["clipped_pct"] > 30:
|
| 89 |
+
print(f" ⚠️ Clipping is frequent → consider lowering LR or extending warmup")
|
| 90 |
|
| 91 |
+
# ── Throughput analysis ──
|
| 92 |
if metrics_history.get("tokens_per_sec"):
|
| 93 |
tps = metrics_history["tokens_per_sec"]
|
| 94 |
tps_valid = [t for t in tps if t > 0]
|
|
|
|
| 100 |
"max": round(max(tps_valid)),
|
| 101 |
}
|
| 102 |
|
| 103 |
+
print(f"\n ⚡ Throughput Analysis:")
|
| 104 |
+
print(f" Mean: {analysis['throughput']['mean']:,} tokens/sec")
|
| 105 |
+
print(f" StdDev: {analysis['throughput']['std']:,}")
|
| 106 |
+
print(f" Range: [{analysis['throughput']['min']:,}, {analysis['throughput']['max']:,}]")
|
| 107 |
|
| 108 |
return analysis
|
| 109 |
|
|
|
|
| 112 |
metrics_history: Dict[str, list],
|
| 113 |
save_path: Optional[str] = None,
|
| 114 |
):
|
| 115 |
+
"""Visualizes training curves as a 4-panel chart."""
|
| 116 |
if not HAS_MATPLOTLIB:
|
| 117 |
+
print("⚠️ matplotlib required: pip install matplotlib")
|
| 118 |
return
|
| 119 |
|
| 120 |
fig, axes = plt.subplots(2, 2, figsize=(16, 10))
|
|
|
|
| 129 |
metrics_history["train_loss"],
|
| 130 |
color="#2563eb", alpha=0.6, linewidth=0.8, label="Train Loss")
|
| 131 |
|
| 132 |
+
# Moving average (smoothing)
|
| 133 |
if len(metrics_history["train_loss"]) > 20:
|
| 134 |
window = min(50, len(metrics_history["train_loss"]) // 5)
|
| 135 |
smoothed = self._moving_average(metrics_history["train_loss"], window)
|
|
|
|
| 192 |
|
| 193 |
save_path = save_path or str(self.save_dir / "training_curves.png")
|
| 194 |
fig.savefig(save_path, dpi=150, bbox_inches="tight")
|
| 195 |
+
print(f"\n 📊 Training curves saved: {save_path}")
|
| 196 |
plt.close(fig)
|
| 197 |
|
| 198 |
def plot_position_loss(
|
|
|
|
| 200 |
position_losses: List[float],
|
| 201 |
save_path: Optional[str] = None,
|
| 202 |
):
|
| 203 |
+
"""Visualizes loss distribution by position."""
|
| 204 |
if not HAS_MATPLOTLIB:
|
| 205 |
return
|
| 206 |
|
|
|
|
| 215 |
ax.set_title("Loss by Position (earlier positions have less context)", fontsize=13, fontweight="bold")
|
| 216 |
ax.grid(True, alpha=0.3)
|
| 217 |
|
| 218 |
+
# Mark key regions
|
| 219 |
if len(position_losses) > 100:
|
| 220 |
early_avg = sum(position_losses[:50]) / 50
|
| 221 |
late_avg = sum(position_losses[-200:]) / 200
|
|
|
|
| 229 |
|
| 230 |
save_path = save_path or str(self.save_dir / "position_loss.png")
|
| 231 |
fig.savefig(save_path, dpi=150, bbox_inches="tight")
|
| 232 |
+
print(f" 📊 Position loss saved: {save_path}")
|
| 233 |
plt.close(fig)
|
| 234 |
|
| 235 |
@staticmethod
|
| 236 |
def _moving_average(data: list, window: int) -> list:
|
| 237 |
+
"""Compute moving average."""
|
| 238 |
result = []
|
| 239 |
for i in range(window - 1, len(data)):
|
| 240 |
avg = sum(data[i - window + 1 : i + 1]) / window
|
llm_lab/evaluation/full_evaluator.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
"""
|
| 2 |
|
| 3 |
import json
|
| 4 |
import time
|
|
@@ -17,9 +17,9 @@ from .attention_viz import AttentionVisualizer
|
|
| 17 |
|
| 18 |
|
| 19 |
class FullEvaluator:
|
| 20 |
-
"""
|
| 21 |
|
| 22 |
-
|
| 23 |
```python
|
| 24 |
evaluator = FullEvaluator(model, tokenizer, val_dataloader, device)
|
| 25 |
report = evaluator.run_full_evaluation()
|
|
@@ -48,24 +48,24 @@ class FullEvaluator:
|
|
| 48 |
self.save_dir.mkdir(parents=True, exist_ok=True)
|
| 49 |
|
| 50 |
def run_full_evaluation(self) -> Dict[str, Any]:
|
| 51 |
-
"""
|
| 52 |
report = {"timestamp": time.strftime("%Y-%m-%d %H:%M:%S")}
|
| 53 |
|
| 54 |
print("\n" + "=" * 70)
|
| 55 |
-
print("🔍
|
| 56 |
print("=" * 70)
|
| 57 |
|
| 58 |
# ── 1. Perplexity ──
|
| 59 |
print("\n" + "━" * 40)
|
| 60 |
-
print("Phase 1/4: Perplexity
|
| 61 |
print("━" * 40)
|
| 62 |
ppl_evaluator = PerplexityEvaluator(self.config)
|
| 63 |
report["perplexity"] = ppl_evaluator.evaluate(
|
| 64 |
self.model, self.val_dataloader, self.device, self.dtype
|
| 65 |
)
|
| 66 |
|
| 67 |
-
#
|
| 68 |
-
print("\n
|
| 69 |
position_losses = ppl_evaluator.evaluate_per_position(
|
| 70 |
self.model, self.val_dataloader, self.device, self.dtype
|
| 71 |
)
|
|
@@ -74,13 +74,13 @@ class FullEvaluator:
|
|
| 74 |
"late_avg": round(sum(position_losses[-200:]) / max(len(position_losses[-200:]), 1), 4),
|
| 75 |
}
|
| 76 |
|
| 77 |
-
#
|
| 78 |
dynamics = TrainingDynamicsAnalyzer(str(self.save_dir))
|
| 79 |
dynamics.plot_position_loss(position_losses, str(self.save_dir / "position_loss.png"))
|
| 80 |
|
| 81 |
-
# ── 2.
|
| 82 |
print("\n" + "━" * 40)
|
| 83 |
-
print("Phase 2/4:
|
| 84 |
print("━" * 40)
|
| 85 |
gen_evaluator = GenerationEvaluator(self.config)
|
| 86 |
gen_results = gen_evaluator.generate_samples(
|
|
@@ -91,52 +91,52 @@ class FullEvaluator:
|
|
| 91 |
"avg_metrics": self._average_gen_metrics(gen_results),
|
| 92 |
}
|
| 93 |
|
| 94 |
-
# ── 3.
|
| 95 |
if self.metrics_history:
|
| 96 |
print("\n" + "━" * 40)
|
| 97 |
-
print("Phase 3/4:
|
| 98 |
print("━" * 40)
|
| 99 |
report["training_dynamics"] = dynamics.analyze_metrics(self.metrics_history)
|
| 100 |
dynamics.plot_training_curves(self.metrics_history,
|
| 101 |
str(self.save_dir / "training_curves.png"))
|
| 102 |
else:
|
| 103 |
-
print("\n Phase 3/4:
|
| 104 |
|
| 105 |
-
# ── 4. Attention
|
| 106 |
print("\n" + "━" * 40)
|
| 107 |
-
print("Phase 4/4: Attention
|
| 108 |
print("━" * 40)
|
| 109 |
try:
|
| 110 |
self._visualize_attention_sample()
|
| 111 |
except Exception as e:
|
| 112 |
-
print(f" ⚠️ Attention
|
| 113 |
|
| 114 |
-
# ──
|
| 115 |
report_path = self.save_dir / "eval_report.json"
|
| 116 |
with open(report_path, "w") as f:
|
| 117 |
json.dump(report, f, indent=2, default=str)
|
| 118 |
-
print(f"\n📋
|
| 119 |
|
| 120 |
-
# ──
|
| 121 |
self._print_summary(report)
|
| 122 |
|
| 123 |
return report
|
| 124 |
|
| 125 |
def _visualize_attention_sample(self):
|
| 126 |
-
"""
|
| 127 |
viz = AttentionVisualizer(str(self.save_dir))
|
| 128 |
|
| 129 |
sample_text = "The cat sat on the mat and looked at the bird."
|
| 130 |
token_ids = self.tokenizer.encode(sample_text, add_special_tokens=False)
|
| 131 |
input_tensor = torch.tensor([token_ids], dtype=torch.long)
|
| 132 |
|
| 133 |
-
#
|
| 134 |
tokens_str = []
|
| 135 |
for tid in token_ids:
|
| 136 |
decoded = self.tokenizer.decode([tid])
|
| 137 |
tokens_str.append(decoded.replace("\n", "\\n"))
|
| 138 |
|
| 139 |
-
# Layer 0 attention
|
| 140 |
attn_weights = viz.extract_attention(
|
| 141 |
self.model, input_tensor, layer_idx=0, device=self.device
|
| 142 |
)
|
|
@@ -150,7 +150,7 @@ class FullEvaluator:
|
|
| 150 |
|
| 151 |
@staticmethod
|
| 152 |
def _average_gen_metrics(gen_results: List[Dict]) -> Dict[str, float]:
|
| 153 |
-
"""
|
| 154 |
if not gen_results:
|
| 155 |
return {}
|
| 156 |
|
|
@@ -165,9 +165,9 @@ class FullEvaluator:
|
|
| 165 |
}
|
| 166 |
|
| 167 |
def _print_summary(self, report: Dict[str, Any]):
|
| 168 |
-
"""
|
| 169 |
print("\n" + "=" * 70)
|
| 170 |
-
print("📋
|
| 171 |
print("=" * 70)
|
| 172 |
|
| 173 |
# Perplexity
|
|
@@ -177,44 +177,44 @@ class FullEvaluator:
|
|
| 177 |
print(f" Loss: {ppl['loss']:.4f}")
|
| 178 |
print(f" PPL: {ppl['perplexity']:.2f}")
|
| 179 |
|
| 180 |
-
#
|
| 181 |
ppl_val = ppl["perplexity"]
|
| 182 |
if ppl_val < 20:
|
| 183 |
-
grade = "🌟
|
| 184 |
elif ppl_val < 35:
|
| 185 |
-
grade = "✅
|
| 186 |
elif ppl_val < 60:
|
| 187 |
-
grade = "⚠️
|
| 188 |
else:
|
| 189 |
-
grade = "❌
|
| 190 |
-
print(f"
|
| 191 |
|
| 192 |
-
#
|
| 193 |
if "position_losses" in report:
|
| 194 |
pl = report["position_losses"]
|
| 195 |
-
print(f"\n 📍
|
| 196 |
-
print(f"
|
| 197 |
-
print(f"
|
| 198 |
-
print(f"
|
| 199 |
|
| 200 |
-
#
|
| 201 |
if "generation" in report and report["generation"].get("avg_metrics"):
|
| 202 |
gm = report["generation"]["avg_metrics"]
|
| 203 |
-
print(f"\n ✍️
|
| 204 |
-
print(f"
|
| 205 |
-
print(f"
|
| 206 |
-
print(f"
|
| 207 |
|
| 208 |
-
#
|
| 209 |
if "training_dynamics" in report:
|
| 210 |
td = report["training_dynamics"]
|
| 211 |
if "loss" in td:
|
| 212 |
-
print(f"\n 📉
|
| 213 |
-
print(f" Loss
|
| 214 |
-
print(f"
|
| 215 |
|
| 216 |
-
#
|
| 217 |
-
print(f"\n 📂
|
| 218 |
for f in sorted(self.save_dir.glob("*")):
|
| 219 |
size = f.stat().st_size / 1024
|
| 220 |
print(f" {f.name} ({size:.1f} KB)")
|
|
|
|
| 1 |
+
"""Comprehensive evaluation runner."""
|
| 2 |
|
| 3 |
import json
|
| 4 |
import time
|
|
|
|
| 17 |
|
| 18 |
|
| 19 |
class FullEvaluator:
|
| 20 |
+
"""Runs all evaluations at once and generates a report.
|
| 21 |
|
| 22 |
+
Usage:
|
| 23 |
```python
|
| 24 |
evaluator = FullEvaluator(model, tokenizer, val_dataloader, device)
|
| 25 |
report = evaluator.run_full_evaluation()
|
|
|
|
| 48 |
self.save_dir.mkdir(parents=True, exist_ok=True)
|
| 49 |
|
| 50 |
def run_full_evaluation(self) -> Dict[str, Any]:
|
| 51 |
+
"""Runs the full evaluation."""
|
| 52 |
report = {"timestamp": time.strftime("%Y-%m-%d %H:%M:%S")}
|
| 53 |
|
| 54 |
print("\n" + "=" * 70)
|
| 55 |
+
print("🔍 Starting comprehensive evaluation")
|
| 56 |
print("=" * 70)
|
| 57 |
|
| 58 |
# ── 1. Perplexity ──
|
| 59 |
print("\n" + "━" * 40)
|
| 60 |
+
print("Phase 1/4: Perplexity measurement")
|
| 61 |
print("━" * 40)
|
| 62 |
ppl_evaluator = PerplexityEvaluator(self.config)
|
| 63 |
report["perplexity"] = ppl_evaluator.evaluate(
|
| 64 |
self.model, self.val_dataloader, self.device, self.dtype
|
| 65 |
)
|
| 66 |
|
| 67 |
+
# Per-position loss
|
| 68 |
+
print("\n Measuring per-position loss...")
|
| 69 |
position_losses = ppl_evaluator.evaluate_per_position(
|
| 70 |
self.model, self.val_dataloader, self.device, self.dtype
|
| 71 |
)
|
|
|
|
| 74 |
"late_avg": round(sum(position_losses[-200:]) / max(len(position_losses[-200:]), 1), 4),
|
| 75 |
}
|
| 76 |
|
| 77 |
+
# Per-position loss visualization
|
| 78 |
dynamics = TrainingDynamicsAnalyzer(str(self.save_dir))
|
| 79 |
dynamics.plot_position_loss(position_losses, str(self.save_dir / "position_loss.png"))
|
| 80 |
|
| 81 |
+
# ── 2. Text generation ──
|
| 82 |
print("\n" + "━" * 40)
|
| 83 |
+
print("Phase 2/4: Text generation")
|
| 84 |
print("━" * 40)
|
| 85 |
gen_evaluator = GenerationEvaluator(self.config)
|
| 86 |
gen_results = gen_evaluator.generate_samples(
|
|
|
|
| 91 |
"avg_metrics": self._average_gen_metrics(gen_results),
|
| 92 |
}
|
| 93 |
|
| 94 |
+
# ── 3. Training dynamics analysis ──
|
| 95 |
if self.metrics_history:
|
| 96 |
print("\n" + "━" * 40)
|
| 97 |
+
print("Phase 3/4: Training dynamics analysis")
|
| 98 |
print("━" * 40)
|
| 99 |
report["training_dynamics"] = dynamics.analyze_metrics(self.metrics_history)
|
| 100 |
dynamics.plot_training_curves(self.metrics_history,
|
| 101 |
str(self.save_dir / "training_curves.png"))
|
| 102 |
else:
|
| 103 |
+
print("\n Phase 3/4: Skipped (no metrics_history)")
|
| 104 |
|
| 105 |
+
# ── 4. Attention visualization (sample) ──
|
| 106 |
print("\n" + "━" * 40)
|
| 107 |
+
print("Phase 4/4: Attention visualization")
|
| 108 |
print("━" * 40)
|
| 109 |
try:
|
| 110 |
self._visualize_attention_sample()
|
| 111 |
except Exception as e:
|
| 112 |
+
print(f" ⚠️ Attention visualization failed: {e}")
|
| 113 |
|
| 114 |
+
# ── Save report ──
|
| 115 |
report_path = self.save_dir / "eval_report.json"
|
| 116 |
with open(report_path, "w") as f:
|
| 117 |
json.dump(report, f, indent=2, default=str)
|
| 118 |
+
print(f"\n📋 Report saved: {report_path}")
|
| 119 |
|
| 120 |
+
# ── Print summary ──
|
| 121 |
self._print_summary(report)
|
| 122 |
|
| 123 |
return report
|
| 124 |
|
| 125 |
def _visualize_attention_sample(self):
|
| 126 |
+
"""Visualizes attention using a sample text."""
|
| 127 |
viz = AttentionVisualizer(str(self.save_dir))
|
| 128 |
|
| 129 |
sample_text = "The cat sat on the mat and looked at the bird."
|
| 130 |
token_ids = self.tokenizer.encode(sample_text, add_special_tokens=False)
|
| 131 |
input_tensor = torch.tensor([token_ids], dtype=torch.long)
|
| 132 |
|
| 133 |
+
# Token strings (for visualization labels)
|
| 134 |
tokens_str = []
|
| 135 |
for tid in token_ids:
|
| 136 |
decoded = self.tokenizer.decode([tid])
|
| 137 |
tokens_str.append(decoded.replace("\n", "\\n"))
|
| 138 |
|
| 139 |
+
# Extract Layer 0 attention
|
| 140 |
attn_weights = viz.extract_attention(
|
| 141 |
self.model, input_tensor, layer_idx=0, device=self.device
|
| 142 |
)
|
|
|
|
| 150 |
|
| 151 |
@staticmethod
|
| 152 |
def _average_gen_metrics(gen_results: List[Dict]) -> Dict[str, float]:
|
| 153 |
+
"""Average generation metrics across all prompts."""
|
| 154 |
if not gen_results:
|
| 155 |
return {}
|
| 156 |
|
|
|
|
| 165 |
}
|
| 166 |
|
| 167 |
def _print_summary(self, report: Dict[str, Any]):
|
| 168 |
+
"""Prints the final summary."""
|
| 169 |
print("\n" + "=" * 70)
|
| 170 |
+
print("📋 Evaluation Summary Report")
|
| 171 |
print("=" * 70)
|
| 172 |
|
| 173 |
# Perplexity
|
|
|
|
| 177 |
print(f" Loss: {ppl['loss']:.4f}")
|
| 178 |
print(f" PPL: {ppl['perplexity']:.2f}")
|
| 179 |
|
| 180 |
+
# Grade assessment
|
| 181 |
ppl_val = ppl["perplexity"]
|
| 182 |
if ppl_val < 20:
|
| 183 |
+
grade = "🌟 Excellent (Strong)"
|
| 184 |
elif ppl_val < 35:
|
| 185 |
+
grade = "✅ Good"
|
| 186 |
elif ppl_val < 60:
|
| 187 |
+
grade = "⚠️ Fair"
|
| 188 |
else:
|
| 189 |
+
grade = "❌ Poor (more training needed)"
|
| 190 |
+
print(f" Grade: {grade}")
|
| 191 |
|
| 192 |
+
# Per-position loss
|
| 193 |
if "position_losses" in report:
|
| 194 |
pl = report["position_losses"]
|
| 195 |
+
print(f"\n 📍 Per-position Loss:")
|
| 196 |
+
print(f" Early (0-50): {pl['early_avg']:.4f}")
|
| 197 |
+
print(f" Late (-200): {pl['late_avg']:.4f}")
|
| 198 |
+
print(f" Context effect: {pl['early_avg'] - pl['late_avg']:.4f} reduction")
|
| 199 |
|
| 200 |
+
# Generation quality
|
| 201 |
if "generation" in report and report["generation"].get("avg_metrics"):
|
| 202 |
gm = report["generation"]["avg_metrics"]
|
| 203 |
+
print(f"\n ✍️ Generation Quality:")
|
| 204 |
+
print(f" Avg length: {gm.get('avg_length', 0):.0f} chars")
|
| 205 |
+
print(f" Repetition rate: {gm.get('repetition_rate', 0):.1%}")
|
| 206 |
+
print(f" Lexical diversity: {gm.get('lexical_diversity', 0):.3f}")
|
| 207 |
|
| 208 |
+
# Training dynamics
|
| 209 |
if "training_dynamics" in report:
|
| 210 |
td = report["training_dynamics"]
|
| 211 |
if "loss" in td:
|
| 212 |
+
print(f"\n 📉 Training Dynamics:")
|
| 213 |
+
print(f" Loss reduction: {td['loss']['initial']:.4f} → {td['loss']['final']:.4f}")
|
| 214 |
+
print(f" Spikes: {len(td['loss']['spikes'])}")
|
| 215 |
|
| 216 |
+
# Generated files
|
| 217 |
+
print(f"\n 📂 Output files:")
|
| 218 |
for f in sorted(self.save_dir.glob("*")):
|
| 219 |
size = f.stat().st_size / 1024
|
| 220 |
print(f" {f.name} ({size:.1f} KB)")
|
llm_lab/evaluation/generation.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
"""
|
| 2 |
|
| 3 |
from typing import Any, Dict, List, Optional
|
| 4 |
|
|
@@ -9,47 +9,47 @@ from llm_lab.config import EvalConfig
|
|
| 9 |
|
| 10 |
|
| 11 |
class GenerationEvaluator:
|
| 12 |
-
"""
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
1)
|
| 16 |
-
2)
|
| 17 |
-
3)
|
| 18 |
-
4)
|
| 19 |
-
5)
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
-
|
| 23 |
-
-
|
| 24 |
-
-
|
| 25 |
-
-
|
| 26 |
"""
|
| 27 |
|
| 28 |
-
#
|
| 29 |
DEFAULT_PROMPTS = [
|
| 30 |
-
# ──
|
| 31 |
"The theory of relativity states that",
|
| 32 |
"In the history of computer science,",
|
| 33 |
"The human brain is remarkable because",
|
| 34 |
|
| 35 |
-
# ──
|
| 36 |
"To understand machine learning, one must first",
|
| 37 |
"The water cycle begins when",
|
| 38 |
"Photosynthesis is the process by which",
|
| 39 |
|
| 40 |
-
# ──
|
| 41 |
"Once upon a time, in a small village near the mountains,",
|
| 42 |
"The detective looked at the evidence and realized that",
|
| 43 |
|
| 44 |
-
# ──
|
| 45 |
"def fibonacci(n):\n \"\"\"Calculate the nth Fibonacci number.\"\"\"\n",
|
| 46 |
"The most important data structures in programming are",
|
| 47 |
|
| 48 |
-
# ──
|
| 49 |
"The capital of France is",
|
| 50 |
"Water boils at a temperature of",
|
| 51 |
|
| 52 |
-
# ──
|
| 53 |
("Artificial intelligence has transformed many industries. "
|
| 54 |
"In healthcare, AI is used for diagnosis and drug discovery. "
|
| 55 |
"In finance, it powers algorithmic trading and fraud detection. "
|
|
@@ -68,7 +68,7 @@ class GenerationEvaluator:
|
|
| 68 |
prompts: Optional[List[str]] = None,
|
| 69 |
verbose: bool = True,
|
| 70 |
) -> List[Dict[str, Any]]:
|
| 71 |
-
"""
|
| 72 |
|
| 73 |
Returns:
|
| 74 |
[{"prompt": str, "generations": [str, ...], "metrics": {...}}, ...]
|
|
@@ -79,7 +79,7 @@ class GenerationEvaluator:
|
|
| 79 |
|
| 80 |
if verbose:
|
| 81 |
print("\n" + "=" * 70)
|
| 82 |
-
print("📝
|
| 83 |
print("=" * 70)
|
| 84 |
|
| 85 |
for idx, prompt in enumerate(prompts):
|
|
@@ -91,17 +91,17 @@ class GenerationEvaluator:
|
|
| 91 |
|
| 92 |
if verbose:
|
| 93 |
print(f"\n{'─'*60}")
|
| 94 |
-
print(f"
|
| 95 |
print(f" \"{prompt[:80]}{'...' if len(prompt) > 80 else ''}\"")
|
| 96 |
print(f"{'─'*60}")
|
| 97 |
|
| 98 |
-
#
|
| 99 |
prompt_ids = tokenizer.encode(prompt, add_special_tokens=False)
|
| 100 |
input_tensor = torch.tensor([prompt_ids], dtype=torch.long, device=device)
|
| 101 |
|
| 102 |
all_texts = []
|
| 103 |
for sample_idx in range(self.config.num_samples):
|
| 104 |
-
#
|
| 105 |
generated_ids = model.generate(
|
| 106 |
input_tensor,
|
| 107 |
max_new_tokens=self.config.max_new_tokens,
|
|
@@ -110,7 +110,7 @@ class GenerationEvaluator:
|
|
| 110 |
top_p=self.config.top_p,
|
| 111 |
)
|
| 112 |
|
| 113 |
-
#
|
| 114 |
new_ids = generated_ids[0][len(prompt_ids):].tolist()
|
| 115 |
generated_text = tokenizer.decode(new_ids)
|
| 116 |
all_texts.append(generated_text)
|
|
@@ -118,23 +118,23 @@ class GenerationEvaluator:
|
|
| 118 |
prompt_results["generations"].append(generated_text)
|
| 119 |
|
| 120 |
if verbose:
|
| 121 |
-
print(f"\n ✍️
|
| 122 |
-
#
|
| 123 |
display_text = generated_text[:500]
|
| 124 |
for line in display_text.split("\n"):
|
| 125 |
print(f" {line}")
|
| 126 |
if len(generated_text) > 500:
|
| 127 |
-
print(f" ... (
|
| 128 |
|
| 129 |
-
#
|
| 130 |
prompt_results["metrics"] = self._compute_generation_metrics(all_texts)
|
| 131 |
|
| 132 |
if verbose and prompt_results["metrics"]:
|
| 133 |
m = prompt_results["metrics"]
|
| 134 |
-
print(f"\n 📊
|
| 135 |
-
f"
|
| 136 |
-
f"
|
| 137 |
-
f"
|
| 138 |
|
| 139 |
results.append(prompt_results)
|
| 140 |
|
|
@@ -142,23 +142,23 @@ class GenerationEvaluator:
|
|
| 142 |
|
| 143 |
@staticmethod
|
| 144 |
def _compute_generation_metrics(texts: List[str]) -> Dict[str, float]:
|
| 145 |
-
"""
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
- avg_length:
|
| 149 |
-
- avg_word_count:
|
| 150 |
-
- repetition_rate: n-gram
|
| 151 |
-
- lexical_diversity:
|
| 152 |
-
- sample_diversity:
|
| 153 |
"""
|
| 154 |
if not texts:
|
| 155 |
return {}
|
| 156 |
|
| 157 |
-
#
|
| 158 |
lengths = [len(t) for t in texts]
|
| 159 |
word_counts = [len(t.split()) for t in texts]
|
| 160 |
|
| 161 |
-
#
|
| 162 |
rep_rates = []
|
| 163 |
for text in texts:
|
| 164 |
words = text.lower().split()
|
|
@@ -167,9 +167,9 @@ class GenerationEvaluator:
|
|
| 167 |
continue
|
| 168 |
ngrams = [tuple(words[i:i+4]) for i in range(len(words)-3)]
|
| 169 |
unique_ratio = len(set(ngrams)) / len(ngrams) if ngrams else 1.0
|
| 170 |
-
rep_rates.append(1.0 - unique_ratio) #
|
| 171 |
|
| 172 |
-
#
|
| 173 |
diversities = []
|
| 174 |
for text in texts:
|
| 175 |
words = text.lower().split()
|
|
@@ -178,7 +178,7 @@ class GenerationEvaluator:
|
|
| 178 |
else:
|
| 179 |
diversities.append(0.0)
|
| 180 |
|
| 181 |
-
#
|
| 182 |
sample_div = 0.0
|
| 183 |
if len(texts) > 1:
|
| 184 |
word_sets = [set(t.lower().split()) for t in texts]
|
|
|
|
| 1 |
+
"""Text generation evaluator."""
|
| 2 |
|
| 3 |
from typing import Any, Dict, List, Optional
|
| 4 |
|
|
|
|
| 9 |
|
| 10 |
|
| 11 |
class GenerationEvaluator:
|
| 12 |
+
"""Evaluates text quality by generating from various prompts.
|
| 13 |
+
|
| 14 |
+
Evaluation perspectives:
|
| 15 |
+
1) Grammatical accuracy: Does it generate grammatically correct English sentences?
|
| 16 |
+
2) Coherence: Does it maintain context continuity?
|
| 17 |
+
3) Diversity: Does it produce different outputs for the same prompt?
|
| 18 |
+
4) Repetition avoidance: Does it avoid repeating the same phrases?
|
| 19 |
+
5) Knowledge expression: Is knowledge from the training data reflected?
|
| 20 |
+
|
| 21 |
+
Realistic expectations for a 1B model:
|
| 22 |
+
- Generates grammatically correct English sentences ✅
|
| 23 |
+
- Maintains coherence within short paragraphs ✅
|
| 24 |
+
- Complex reasoning or extended logical chains ❌ (requires a larger model)
|
| 25 |
+
- Factual accuracy is not guaranteed ⚠️
|
| 26 |
"""
|
| 27 |
|
| 28 |
+
# Test prompts from various domains
|
| 29 |
DEFAULT_PROMPTS = [
|
| 30 |
+
# ── General knowledge ──
|
| 31 |
"The theory of relativity states that",
|
| 32 |
"In the history of computer science,",
|
| 33 |
"The human brain is remarkable because",
|
| 34 |
|
| 35 |
+
# ── Explanation / Education ──
|
| 36 |
"To understand machine learning, one must first",
|
| 37 |
"The water cycle begins when",
|
| 38 |
"Photosynthesis is the process by which",
|
| 39 |
|
| 40 |
+
# ── Narrative / Story ──
|
| 41 |
"Once upon a time, in a small village near the mountains,",
|
| 42 |
"The detective looked at the evidence and realized that",
|
| 43 |
|
| 44 |
+
# ── Code / Technical ──
|
| 45 |
"def fibonacci(n):\n \"\"\"Calculate the nth Fibonacci number.\"\"\"\n",
|
| 46 |
"The most important data structures in programming are",
|
| 47 |
|
| 48 |
+
# ── Short completion ──
|
| 49 |
"The capital of France is",
|
| 50 |
"Water boils at a temperature of",
|
| 51 |
|
| 52 |
+
# ── Long context ──
|
| 53 |
("Artificial intelligence has transformed many industries. "
|
| 54 |
"In healthcare, AI is used for diagnosis and drug discovery. "
|
| 55 |
"In finance, it powers algorithmic trading and fraud detection. "
|
|
|
|
| 68 |
prompts: Optional[List[str]] = None,
|
| 69 |
verbose: bool = True,
|
| 70 |
) -> List[Dict[str, Any]]:
|
| 71 |
+
"""Generates text for each prompt.
|
| 72 |
|
| 73 |
Returns:
|
| 74 |
[{"prompt": str, "generations": [str, ...], "metrics": {...}}, ...]
|
|
|
|
| 79 |
|
| 80 |
if verbose:
|
| 81 |
print("\n" + "=" * 70)
|
| 82 |
+
print("📝 Text Generation Evaluation")
|
| 83 |
print("=" * 70)
|
| 84 |
|
| 85 |
for idx, prompt in enumerate(prompts):
|
|
|
|
| 91 |
|
| 92 |
if verbose:
|
| 93 |
print(f"\n{'─'*60}")
|
| 94 |
+
print(f"Prompt [{idx+1}/{len(prompts)}]:")
|
| 95 |
print(f" \"{prompt[:80]}{'...' if len(prompt) > 80 else ''}\"")
|
| 96 |
print(f"{'─'*60}")
|
| 97 |
|
| 98 |
+
# Encode prompt
|
| 99 |
prompt_ids = tokenizer.encode(prompt, add_special_tokens=False)
|
| 100 |
input_tensor = torch.tensor([prompt_ids], dtype=torch.long, device=device)
|
| 101 |
|
| 102 |
all_texts = []
|
| 103 |
for sample_idx in range(self.config.num_samples):
|
| 104 |
+
# Generate
|
| 105 |
generated_ids = model.generate(
|
| 106 |
input_tensor,
|
| 107 |
max_new_tokens=self.config.max_new_tokens,
|
|
|
|
| 110 |
top_p=self.config.top_p,
|
| 111 |
)
|
| 112 |
|
| 113 |
+
# Decode (only the part after the prompt)
|
| 114 |
new_ids = generated_ids[0][len(prompt_ids):].tolist()
|
| 115 |
generated_text = tokenizer.decode(new_ids)
|
| 116 |
all_texts.append(generated_text)
|
|
|
|
| 118 |
prompt_results["generations"].append(generated_text)
|
| 119 |
|
| 120 |
if verbose:
|
| 121 |
+
print(f"\n ✍️ Generation #{sample_idx+1}:")
|
| 122 |
+
# Clean output (including newlines)
|
| 123 |
display_text = generated_text[:500]
|
| 124 |
for line in display_text.split("\n"):
|
| 125 |
print(f" {line}")
|
| 126 |
if len(generated_text) > 500:
|
| 127 |
+
print(f" ... (total {len(generated_text)} characters)")
|
| 128 |
|
| 129 |
+
# Generation quality metrics
|
| 130 |
prompt_results["metrics"] = self._compute_generation_metrics(all_texts)
|
| 131 |
|
| 132 |
if verbose and prompt_results["metrics"]:
|
| 133 |
m = prompt_results["metrics"]
|
| 134 |
+
print(f"\n 📊 Metrics: "
|
| 135 |
+
f"avg_length={m['avg_length']:.0f} chars, "
|
| 136 |
+
f"repetition_rate={m['repetition_rate']:.1%}, "
|
| 137 |
+
f"lexical_diversity={m['lexical_diversity']:.2f}")
|
| 138 |
|
| 139 |
results.append(prompt_results)
|
| 140 |
|
|
|
|
| 142 |
|
| 143 |
@staticmethod
|
| 144 |
def _compute_generation_metrics(texts: List[str]) -> Dict[str, float]:
|
| 145 |
+
"""Computes quality metrics for generated text.
|
| 146 |
+
|
| 147 |
+
Metrics:
|
| 148 |
+
- avg_length: Average generation length (characters)
|
| 149 |
+
- avg_word_count: Average word count
|
| 150 |
+
- repetition_rate: n-gram repetition rate (lower is better)
|
| 151 |
+
- lexical_diversity: Ratio of unique words (higher means more diverse)
|
| 152 |
+
- sample_diversity: Diversity across samples (how different are different generations)
|
| 153 |
"""
|
| 154 |
if not texts:
|
| 155 |
return {}
|
| 156 |
|
| 157 |
+
# Length
|
| 158 |
lengths = [len(t) for t in texts]
|
| 159 |
word_counts = [len(t.split()) for t in texts]
|
| 160 |
|
| 161 |
+
# Repetition rate (based on 4-grams)
|
| 162 |
rep_rates = []
|
| 163 |
for text in texts:
|
| 164 |
words = text.lower().split()
|
|
|
|
| 167 |
continue
|
| 168 |
ngrams = [tuple(words[i:i+4]) for i in range(len(words)-3)]
|
| 169 |
unique_ratio = len(set(ngrams)) / len(ngrams) if ngrams else 1.0
|
| 170 |
+
rep_rates.append(1.0 - unique_ratio) # repetition rate = 1 - unique ratio
|
| 171 |
|
| 172 |
+
# Lexical diversity (Type-Token Ratio)
|
| 173 |
diversities = []
|
| 174 |
for text in texts:
|
| 175 |
words = text.lower().split()
|
|
|
|
| 178 |
else:
|
| 179 |
diversities.append(0.0)
|
| 180 |
|
| 181 |
+
# Inter-sample diversity (inverse of Jaccard similarity)
|
| 182 |
sample_div = 0.0
|
| 183 |
if len(texts) > 1:
|
| 184 |
word_sets = [set(t.lower().split()) for t in texts]
|
llm_lab/evaluation/perplexity.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
"""Perplexity(PPL)
|
| 2 |
|
| 3 |
import math
|
| 4 |
import time
|
|
@@ -13,26 +13,26 @@ from llm_lab.config import EvalConfig
|
|
| 13 |
|
| 14 |
|
| 15 |
class PerplexityEvaluator:
|
| 16 |
-
"""Perplexity(PPL)
|
| 17 |
|
| 18 |
-
Perplexity
|
| 19 |
PPL = exp(average cross-entropy loss)
|
| 20 |
|
| 21 |
-
|
| 22 |
-
- PPL = 1:
|
| 23 |
-
- PPL = 10:
|
| 24 |
-
- PPL = 100:
|
| 25 |
-
- PPL = 32000:
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
-
|
| 29 |
-
-
|
| 30 |
-
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
-
|
| 34 |
-
-
|
| 35 |
-
-
|
| 36 |
"""
|
| 37 |
|
| 38 |
def __init__(self, config: EvalConfig):
|
|
@@ -47,14 +47,14 @@ class PerplexityEvaluator:
|
|
| 47 |
dtype: torch.dtype = torch.bfloat16,
|
| 48 |
desc: str = "Evaluation",
|
| 49 |
) -> Dict[str, float]:
|
| 50 |
-
"""
|
| 51 |
|
| 52 |
Returns:
|
| 53 |
{
|
| 54 |
-
"loss":
|
| 55 |
"perplexity": exp(loss),
|
| 56 |
-
"num_tokens":
|
| 57 |
-
"num_batches":
|
| 58 |
}
|
| 59 |
"""
|
| 60 |
model.eval()
|
|
@@ -76,7 +76,7 @@ class PerplexityEvaluator:
|
|
| 76 |
with torch.amp.autocast(device_type="cuda", dtype=dtype, enabled=(dtype != torch.float32)):
|
| 77 |
logits, _ = model(input_ids)
|
| 78 |
|
| 79 |
-
#
|
| 80 |
# logits: (B, S, V) → (B*S, V)
|
| 81 |
# targets: (B, S) → (B*S,)
|
| 82 |
loss_per_token = F.cross_entropy(
|
|
@@ -86,7 +86,7 @@ class PerplexityEvaluator:
|
|
| 86 |
reduction="none",
|
| 87 |
)
|
| 88 |
|
| 89 |
-
#
|
| 90 |
valid_mask = (targets.view(-1) != -100)
|
| 91 |
valid_tokens = valid_mask.sum().item()
|
| 92 |
|
|
@@ -100,7 +100,7 @@ class PerplexityEvaluator:
|
|
| 100 |
|
| 101 |
elapsed = time.time() - start_time
|
| 102 |
avg_loss = total_loss / max(total_tokens, 1)
|
| 103 |
-
perplexity = math.exp(min(avg_loss, 100)) #
|
| 104 |
|
| 105 |
results = {
|
| 106 |
"loss": round(avg_loss, 4),
|
|
@@ -113,8 +113,8 @@ class PerplexityEvaluator:
|
|
| 113 |
print(f" ────────────────────────────────")
|
| 114 |
print(f" Loss: {results['loss']:.4f}")
|
| 115 |
print(f" Perplexity: {results['perplexity']:.2f}")
|
| 116 |
-
print(f"
|
| 117 |
-
print(f"
|
| 118 |
|
| 119 |
return results
|
| 120 |
|
|
@@ -127,12 +127,12 @@ class PerplexityEvaluator:
|
|
| 127 |
dtype: torch.dtype = torch.bfloat16,
|
| 128 |
max_batches: int = 50,
|
| 129 |
) -> List[float]:
|
| 130 |
-
"""
|
| 131 |
|
| 132 |
-
|
| 133 |
-
-
|
| 134 |
-
-
|
| 135 |
-
-
|
| 136 |
"""
|
| 137 |
model.eval()
|
| 138 |
seq_len = None
|
|
@@ -155,7 +155,7 @@ class PerplexityEvaluator:
|
|
| 155 |
with torch.amp.autocast(device_type="cuda", dtype=dtype, enabled=(dtype != torch.float32)):
|
| 156 |
logits, _ = model(input_ids)
|
| 157 |
|
| 158 |
-
# (B, S)
|
| 159 |
loss_per_token = F.cross_entropy(
|
| 160 |
logits.view(-1, logits.size(-1)),
|
| 161 |
targets.view(-1),
|
|
@@ -167,6 +167,6 @@ class PerplexityEvaluator:
|
|
| 167 |
position_loss_sum += (loss_per_token * valid_mask).sum(dim=0)
|
| 168 |
position_count += valid_mask.sum(dim=0)
|
| 169 |
|
| 170 |
-
#
|
| 171 |
position_avg_loss = (position_loss_sum / position_count.clamp(min=1)).cpu().tolist()
|
| 172 |
return position_avg_loss
|
|
|
|
| 1 |
+
"""Perplexity (PPL) evaluator."""
|
| 2 |
|
| 3 |
import math
|
| 4 |
import time
|
|
|
|
| 13 |
|
| 14 |
|
| 15 |
class PerplexityEvaluator:
|
| 16 |
+
"""Measures Perplexity (PPL).
|
| 17 |
|
| 18 |
+
What is Perplexity?
|
| 19 |
PPL = exp(average cross-entropy loss)
|
| 20 |
|
| 21 |
+
Intuitive meaning:
|
| 22 |
+
- PPL = 1: Perfect prediction (impossible)
|
| 23 |
+
- PPL = 10: Equivalent to picking from 10 candidates each time
|
| 24 |
+
- PPL = 100: Equivalent to picking from 100 candidates (close to random)
|
| 25 |
+
- PPL = 32000: Random selection from the entire vocab (initial random model)
|
| 26 |
+
|
| 27 |
+
Good benchmark for a 1B model (English web text):
|
| 28 |
+
- Trained on 5B tokens: PPL ~30-40
|
| 29 |
+
- Trained on 10B tokens: PPL ~20-30
|
| 30 |
+
- Trained on 20B tokens: PPL ~15-25
|
| 31 |
+
|
| 32 |
+
Measurement method:
|
| 33 |
+
- Compute cross-entropy over all tokens in the validation dataset
|
| 34 |
+
- Average per token, then apply exp()
|
| 35 |
+
- Padding tokens are excluded (ignore_index=-100)
|
| 36 |
"""
|
| 37 |
|
| 38 |
def __init__(self, config: EvalConfig):
|
|
|
|
| 47 |
dtype: torch.dtype = torch.bfloat16,
|
| 48 |
desc: str = "Evaluation",
|
| 49 |
) -> Dict[str, float]:
|
| 50 |
+
"""Measures Perplexity.
|
| 51 |
|
| 52 |
Returns:
|
| 53 |
{
|
| 54 |
+
"loss": average cross-entropy loss,
|
| 55 |
"perplexity": exp(loss),
|
| 56 |
+
"num_tokens": total number of tokens used for evaluation,
|
| 57 |
+
"num_batches": number of batches used for evaluation,
|
| 58 |
}
|
| 59 |
"""
|
| 60 |
model.eval()
|
|
|
|
| 76 |
with torch.amp.autocast(device_type="cuda", dtype=dtype, enabled=(dtype != torch.float32)):
|
| 77 |
logits, _ = model(input_ids)
|
| 78 |
|
| 79 |
+
# Per-token cross-entropy (reduction='none')
|
| 80 |
# logits: (B, S, V) → (B*S, V)
|
| 81 |
# targets: (B, S) → (B*S,)
|
| 82 |
loss_per_token = F.cross_entropy(
|
|
|
|
| 86 |
reduction="none",
|
| 87 |
)
|
| 88 |
|
| 89 |
+
# Count only valid tokens that are not -100
|
| 90 |
valid_mask = (targets.view(-1) != -100)
|
| 91 |
valid_tokens = valid_mask.sum().item()
|
| 92 |
|
|
|
|
| 100 |
|
| 101 |
elapsed = time.time() - start_time
|
| 102 |
avg_loss = total_loss / max(total_tokens, 1)
|
| 103 |
+
perplexity = math.exp(min(avg_loss, 100)) # prevent overflow
|
| 104 |
|
| 105 |
results = {
|
| 106 |
"loss": round(avg_loss, 4),
|
|
|
|
| 113 |
print(f" ────────────────────────────────")
|
| 114 |
print(f" Loss: {results['loss']:.4f}")
|
| 115 |
print(f" Perplexity: {results['perplexity']:.2f}")
|
| 116 |
+
print(f" Eval tokens: {total_tokens:,}")
|
| 117 |
+
print(f" Elapsed: {elapsed:.1f}s")
|
| 118 |
|
| 119 |
return results
|
| 120 |
|
|
|
|
| 127 |
dtype: torch.dtype = torch.bfloat16,
|
| 128 |
max_batches: int = 50,
|
| 129 |
) -> List[float]:
|
| 130 |
+
"""Measures loss per position within a sequence.
|
| 131 |
|
| 132 |
+
Learning insight:
|
| 133 |
+
- Positions 0~10: Higher loss (insufficient context)
|
| 134 |
+
- Positions 100+: Loss stabilizes lower (context is leveraged)
|
| 135 |
+
- This pattern demonstrates the Transformer's in-context learning capability
|
| 136 |
"""
|
| 137 |
model.eval()
|
| 138 |
seq_len = None
|
|
|
|
| 155 |
with torch.amp.autocast(device_type="cuda", dtype=dtype, enabled=(dtype != torch.float32)):
|
| 156 |
logits, _ = model(input_ids)
|
| 157 |
|
| 158 |
+
# Per-token loss in shape (B, S)
|
| 159 |
loss_per_token = F.cross_entropy(
|
| 160 |
logits.view(-1, logits.size(-1)),
|
| 161 |
targets.view(-1),
|
|
|
|
| 167 |
position_loss_sum += (loss_per_token * valid_mask).sum(dim=0)
|
| 168 |
position_count += valid_mask.sum(dim=0)
|
| 169 |
|
| 170 |
+
# Average loss per position
|
| 171 |
position_avg_loss = (position_loss_sum / position_count.clamp(min=1)).cpu().tolist()
|
| 172 |
return position_avg_loss
|
llm_lab/evaluation/runner.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
"""
|
| 2 |
|
| 3 |
from typing import Any, Dict, Optional
|
| 4 |
|
|
@@ -20,13 +20,13 @@ def run_evaluation(
|
|
| 20 |
metrics_history: Optional[Dict[str, list]] = None,
|
| 21 |
config: Optional[EvalConfig] = None,
|
| 22 |
) -> Dict[str, Any]:
|
| 23 |
-
"""
|
| 24 |
|
| 25 |
-
|
| 26 |
```python
|
| 27 |
from llm_lab.evaluation import run_evaluation
|
| 28 |
|
| 29 |
-
#
|
| 30 |
report = run_evaluation(
|
| 31 |
model=trainer.model,
|
| 32 |
tokenizer=tokenizer,
|
|
@@ -50,7 +50,7 @@ def run_evaluation(
|
|
| 50 |
|
| 51 |
report = evaluator.run_full_evaluation()
|
| 52 |
|
| 53 |
-
#
|
| 54 |
InsightChecklist.run_checklist(report, metrics_history)
|
| 55 |
|
| 56 |
return report
|
|
|
|
| 1 |
+
"""Evaluation runner helper (Quick Start)."""
|
| 2 |
|
| 3 |
from typing import Any, Dict, Optional
|
| 4 |
|
|
|
|
| 20 |
metrics_history: Optional[Dict[str, list]] = None,
|
| 21 |
config: Optional[EvalConfig] = None,
|
| 22 |
) -> Dict[str, Any]:
|
| 23 |
+
"""Runs all evaluations in one call.
|
| 24 |
|
| 25 |
+
Usage (Colab):
|
| 26 |
```python
|
| 27 |
from llm_lab.evaluation import run_evaluation
|
| 28 |
|
| 29 |
+
# After training is complete
|
| 30 |
report = run_evaluation(
|
| 31 |
model=trainer.model,
|
| 32 |
tokenizer=tokenizer,
|
|
|
|
| 50 |
|
| 51 |
report = evaluator.run_full_evaluation()
|
| 52 |
|
| 53 |
+
# Insight checklist
|
| 54 |
InsightChecklist.run_checklist(report, metrics_history)
|
| 55 |
|
| 56 |
return report
|
llm_lab/evaluation/scaling.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
"""Scaling Law
|
| 2 |
|
| 3 |
from pathlib import Path
|
| 4 |
from typing import Any, Dict, List, Optional
|
|
@@ -19,17 +19,17 @@ except ImportError:
|
|
| 19 |
|
| 20 |
|
| 21 |
class ScalingAnalyzer:
|
| 22 |
-
"""10M → 100M → 1B
|
| 23 |
|
| 24 |
Chinchilla Scaling Law (2022):
|
| 25 |
-
-
|
| 26 |
-
- Loss ∝ N^(-α) × D^(-β) (N=
|
| 27 |
-
- α ≈ 0.076, β ≈ 0.095 (
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
-
|
| 31 |
-
-
|
| 32 |
-
-
|
| 33 |
"""
|
| 34 |
|
| 35 |
def __init__(self, save_dir: str = "./eval_results"):
|
|
@@ -40,7 +40,7 @@ class ScalingAnalyzer:
|
|
| 40 |
self,
|
| 41 |
model_results: List[Dict[str, Any]],
|
| 42 |
) -> Dict[str, Any]:
|
| 43 |
-
"""
|
| 44 |
|
| 45 |
Args:
|
| 46 |
model_results: [
|
|
@@ -50,25 +50,25 @@ class ScalingAnalyzer:
|
|
| 50 |
]
|
| 51 |
|
| 52 |
Returns:
|
| 53 |
-
|
| 54 |
"""
|
| 55 |
if len(model_results) < 2:
|
| 56 |
-
print("⚠️ Scaling
|
| 57 |
return {}
|
| 58 |
|
| 59 |
print("\n" + "=" * 70)
|
| 60 |
-
print("📈 Scaling Law
|
| 61 |
print("=" * 70)
|
| 62 |
|
| 63 |
-
# ──
|
| 64 |
-
print(f"\n {'
|
| 65 |
print(f" {'─'*52}")
|
| 66 |
for r in model_results:
|
| 67 |
params_str = f"{r['params']/1e6:.0f}M" if r["params"] < 1e9 else f"{r['params']/1e9:.1f}B"
|
| 68 |
tokens_str = f"{r['tokens']/1e9:.1f}B"
|
| 69 |
print(f" {r['name']:<8} {params_str:>12} {tokens_str:>10} {r['loss']:>8.4f} {r['ppl']:>8.2f}")
|
| 70 |
|
| 71 |
-
# ── Scaling
|
| 72 |
analysis = {"models": model_results, "scaling_efficiency": []}
|
| 73 |
|
| 74 |
for i in range(1, len(model_results)):
|
|
@@ -89,17 +89,17 @@ class ScalingAnalyzer:
|
|
| 89 |
analysis["scaling_efficiency"].append(efficiency)
|
| 90 |
|
| 91 |
print(f"\n {prev['name']} → {curr['name']}:")
|
| 92 |
-
print(f"
|
| 93 |
-
print(f" Loss
|
| 94 |
-
print(f" PPL
|
| 95 |
|
| 96 |
-
# ── Chinchilla
|
| 97 |
-
print(f"\n Chinchilla
|
| 98 |
for r in model_results:
|
| 99 |
actual_ratio = r["tokens"] / r["params"]
|
| 100 |
-
status = "✅
|
| 101 |
-
print(f" {r['name']}:
|
| 102 |
-
f"(
|
| 103 |
|
| 104 |
analysis["chinchilla_ratios"] = [
|
| 105 |
{"name": r["name"], "ratio": round(r["tokens"] / r["params"], 1)}
|
|
@@ -113,9 +113,9 @@ class ScalingAnalyzer:
|
|
| 113 |
model_results: List[Dict[str, Any]],
|
| 114 |
save_path: Optional[str] = None,
|
| 115 |
):
|
| 116 |
-
"""
|
| 117 |
if not HAS_MATPLOTLIB or not HAS_NUMPY:
|
| 118 |
-
print("⚠️ matplotlib/numpy
|
| 119 |
return
|
| 120 |
|
| 121 |
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
|
|
@@ -149,5 +149,5 @@ class ScalingAnalyzer:
|
|
| 149 |
|
| 150 |
save_path = save_path or str(self.save_dir / "scaling_curves.png")
|
| 151 |
fig.savefig(save_path, dpi=150, bbox_inches="tight")
|
| 152 |
-
print(f"\n 📊 Scaling
|
| 153 |
plt.close(fig)
|
|
|
|
| 1 |
+
"""Scaling Law analyzer."""
|
| 2 |
|
| 3 |
from pathlib import Path
|
| 4 |
from typing import Any, Dict, List, Optional
|
|
|
|
| 19 |
|
| 20 |
|
| 21 |
class ScalingAnalyzer:
|
| 22 |
+
"""Analyzes Scaling Law across 10M → 100M → 1B models.
|
| 23 |
|
| 24 |
Chinchilla Scaling Law (2022):
|
| 25 |
+
- Optimal training: tokens ≈ 20 × number of parameters
|
| 26 |
+
- Loss ∝ N^(-α) × D^(-β) (N=parameters, D=data)
|
| 27 |
+
- α ≈ 0.076, β ≈ 0.095 (per the paper)
|
| 28 |
+
|
| 29 |
+
Purpose of this analysis:
|
| 30 |
+
- Verify whether our model follows the Scaling Law
|
| 31 |
+
- Predict the effect of larger models / more data
|
| 32 |
+
- Understand the optimal allocation of compute resources
|
| 33 |
"""
|
| 34 |
|
| 35 |
def __init__(self, save_dir: str = "./eval_results"):
|
|
|
|
| 40 |
self,
|
| 41 |
model_results: List[Dict[str, Any]],
|
| 42 |
) -> Dict[str, Any]:
|
| 43 |
+
"""Comparatively analyzes results across multiple model sizes.
|
| 44 |
|
| 45 |
Args:
|
| 46 |
model_results: [
|
|
|
|
| 50 |
]
|
| 51 |
|
| 52 |
Returns:
|
| 53 |
+
Analysis result dictionary
|
| 54 |
"""
|
| 55 |
if len(model_results) < 2:
|
| 56 |
+
print("⚠️ Scaling analysis requires results from at least 2 models.")
|
| 57 |
return {}
|
| 58 |
|
| 59 |
print("\n" + "=" * 70)
|
| 60 |
+
print("📈 Scaling Law Analysis")
|
| 61 |
print("=" * 70)
|
| 62 |
|
| 63 |
+
# ── Results table ──
|
| 64 |
+
print(f"\n {'Model':<8} {'Parameters':>12} {'Tokens':>10} {'Loss':>8} {'PPL':>8}")
|
| 65 |
print(f" {'─'*52}")
|
| 66 |
for r in model_results:
|
| 67 |
params_str = f"{r['params']/1e6:.0f}M" if r["params"] < 1e9 else f"{r['params']/1e9:.1f}B"
|
| 68 |
tokens_str = f"{r['tokens']/1e9:.1f}B"
|
| 69 |
print(f" {r['name']:<8} {params_str:>12} {tokens_str:>10} {r['loss']:>8.4f} {r['ppl']:>8.2f}")
|
| 70 |
|
| 71 |
+
# ── Scaling efficiency calculation ──
|
| 72 |
analysis = {"models": model_results, "scaling_efficiency": []}
|
| 73 |
|
| 74 |
for i in range(1, len(model_results)):
|
|
|
|
| 89 |
analysis["scaling_efficiency"].append(efficiency)
|
| 90 |
|
| 91 |
print(f"\n {prev['name']} → {curr['name']}:")
|
| 92 |
+
print(f" Parameters ×{param_ratio:.1f}")
|
| 93 |
+
print(f" Loss reduction: {loss_reduction:.4f}")
|
| 94 |
+
print(f" PPL reduction: {ppl_reduction*100:.1f}%")
|
| 95 |
|
| 96 |
+
# ── Chinchilla optimality check ──
|
| 97 |
+
print(f"\n Chinchilla optimality check (tokens ≈ 20 × parameters):")
|
| 98 |
for r in model_results:
|
| 99 |
actual_ratio = r["tokens"] / r["params"]
|
| 100 |
+
status = "✅ Optimal range" if 15 <= actual_ratio <= 25 else "⚠️ Out of range"
|
| 101 |
+
print(f" {r['name']}: tokens/parameters = {actual_ratio:.1f}x "
|
| 102 |
+
f"(optimal: 20x) {status}")
|
| 103 |
|
| 104 |
analysis["chinchilla_ratios"] = [
|
| 105 |
{"name": r["name"], "ratio": round(r["tokens"] / r["params"], 1)}
|
|
|
|
| 113 |
model_results: List[Dict[str, Any]],
|
| 114 |
save_path: Optional[str] = None,
|
| 115 |
):
|
| 116 |
+
"""Visualizes scaling curves."""
|
| 117 |
if not HAS_MATPLOTLIB or not HAS_NUMPY:
|
| 118 |
+
print("⚠️ matplotlib/numpy required: pip install matplotlib numpy")
|
| 119 |
return
|
| 120 |
|
| 121 |
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
|
|
|
|
| 149 |
|
| 150 |
save_path = save_path or str(self.save_dir / "scaling_curves.png")
|
| 151 |
fig.savefig(save_path, dpi=150, bbox_inches="tight")
|
| 152 |
+
print(f"\n 📊 Scaling curves saved: {save_path}")
|
| 153 |
plt.close(fig)
|
llm_lab/model/__init__.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
"""
|
| 2 |
from .norm import RMSNorm
|
| 3 |
from .rope import RotaryPositionalEmbedding
|
| 4 |
from .attention import GroupedQueryAttention
|
|
|
|
| 1 |
+
"""Model architecture module — LLaMA-style Decoder-Only Transformer."""
|
| 2 |
from .norm import RMSNorm
|
| 3 |
from .rope import RotaryPositionalEmbedding
|
| 4 |
from .attention import GroupedQueryAttention
|
llm_lab/model/attention.py
CHANGED
|
@@ -11,20 +11,20 @@ from .rope import RotaryPositionalEmbedding
|
|
| 11 |
|
| 12 |
|
| 13 |
class GroupedQueryAttention(nn.Module):
|
| 14 |
-
"""GQA:
|
| 15 |
|
| 16 |
MHA vs GQA vs MQA:
|
| 17 |
-
- MHA (Multi-Head Attention): Q, K, V
|
| 18 |
-
- MQA (Multi-Query Attention): K, V
|
| 19 |
-
- GQA (Grouped Query Attention): K, V
|
| 20 |
-
→ MHA
|
| 21 |
|
| 22 |
-
|
| 23 |
-
Q
|
| 24 |
-
K/V
|
| 25 |
-
→ Q
|
| 26 |
|
| 27 |
-
Attention
|
| 28 |
Attention(Q, K, V) = softmax(Q·K^T / √d_k) · V
|
| 29 |
"""
|
| 30 |
|
|
@@ -36,14 +36,14 @@ class GroupedQueryAttention(nn.Module):
|
|
| 36 |
self.num_kv_heads = config.num_kv_heads
|
| 37 |
self.num_kv_groups = config.num_kv_groups # num_heads // num_kv_heads
|
| 38 |
|
| 39 |
-
# Q/K/V
|
| 40 |
# Q: hidden_dim → num_heads × head_dim
|
| 41 |
self.q_proj = nn.Linear(config.hidden_dim, config.num_heads * self.head_dim, bias=False)
|
| 42 |
-
# K, V: hidden_dim → num_kv_heads × head_dim (
|
| 43 |
self.k_proj = nn.Linear(config.hidden_dim, config.num_kv_heads * self.head_dim, bias=False)
|
| 44 |
self.v_proj = nn.Linear(config.hidden_dim, config.num_kv_heads * self.head_dim, bias=False)
|
| 45 |
|
| 46 |
-
#
|
| 47 |
self.o_proj = nn.Linear(config.num_heads * self.head_dim, config.hidden_dim, bias=False)
|
| 48 |
|
| 49 |
# RoPE
|
|
@@ -51,7 +51,7 @@ class GroupedQueryAttention(nn.Module):
|
|
| 51 |
dim=self.head_dim, max_seq_len=config.max_seq_len, theta=config.rope_theta
|
| 52 |
)
|
| 53 |
|
| 54 |
-
# Attention dropout (
|
| 55 |
self.attn_dropout = nn.Dropout(config.dropout)
|
| 56 |
|
| 57 |
def forward(
|
|
@@ -64,7 +64,7 @@ class GroupedQueryAttention(nn.Module):
|
|
| 64 |
Args:
|
| 65 |
x: (batch_size, seq_len, hidden_dim)
|
| 66 |
mask: (seq_len, seq_len) causal mask
|
| 67 |
-
position_offset:
|
| 68 |
|
| 69 |
Returns:
|
| 70 |
(batch_size, seq_len, hidden_dim)
|
|
@@ -72,13 +72,13 @@ class GroupedQueryAttention(nn.Module):
|
|
| 72 |
B, S, _ = x.shape
|
| 73 |
|
| 74 |
# ──────────────────────────────────────────────
|
| 75 |
-
# Step 1: Q, K, V
|
| 76 |
# ──────────────────────────────────────────────
|
| 77 |
q = self.q_proj(x) # (B, S, num_heads × head_dim)
|
| 78 |
k = self.k_proj(x) # (B, S, num_kv_heads × head_dim)
|
| 79 |
v = self.v_proj(x) # (B, S, num_kv_heads × head_dim)
|
| 80 |
|
| 81 |
-
#
|
| 82 |
q = q.view(B, S, self.num_heads, self.head_dim).transpose(1, 2)
|
| 83 |
# → (B, num_heads, S, head_dim)
|
| 84 |
k = k.view(B, S, self.num_kv_heads, self.head_dim).transpose(1, 2)
|
|
@@ -86,16 +86,16 @@ class GroupedQueryAttention(nn.Module):
|
|
| 86 |
v = v.view(B, S, self.num_kv_heads, self.head_dim).transpose(1, 2)
|
| 87 |
|
| 88 |
# ──────────────────────────────────────────────
|
| 89 |
-
# Step 2: RoPE
|
| 90 |
# ──────────────────────────────────────────────
|
| 91 |
-
#
|
| 92 |
-
# "
|
| 93 |
q, k = self.rope(q, k, position_offset)
|
| 94 |
|
| 95 |
# ──────────────────────────────────────────────
|
| 96 |
-
# Step 3: GQA - KV
|
| 97 |
# ──────────────────────────────────────────────
|
| 98 |
-
# num_kv_heads=4 → num_heads=16:
|
| 99 |
if self.num_kv_groups > 1:
|
| 100 |
k = self._repeat_kv(k) # (B, num_heads, S, head_dim)
|
| 101 |
v = self._repeat_kv(v)
|
|
@@ -103,17 +103,17 @@ class GroupedQueryAttention(nn.Module):
|
|
| 103 |
# ──────────────────────────────────────────────
|
| 104 |
# Step 4: Scaled Dot-Product Attention
|
| 105 |
# ──────────────────────────────────────────────
|
| 106 |
-
# PyTorch >= 2.0
|
| 107 |
attn_out = F.scaled_dot_product_attention(
|
| 108 |
q, k, v,
|
| 109 |
attn_mask=mask,
|
| 110 |
dropout_p=self.config.dropout if self.training else 0.0,
|
| 111 |
-
is_causal=(mask is None), #
|
| 112 |
)
|
| 113 |
# → (B, num_heads, S, head_dim)
|
| 114 |
|
| 115 |
# ──────────────────────────────────────────────
|
| 116 |
-
# Step 5:
|
| 117 |
# ──────────────────────────────────────────────
|
| 118 |
attn_out = attn_out.transpose(1, 2).contiguous().view(B, S, -1)
|
| 119 |
# → (B, S, num_heads × head_dim)
|
|
@@ -121,11 +121,11 @@ class GroupedQueryAttention(nn.Module):
|
|
| 121 |
return self.o_proj(attn_out) # → (B, S, hidden_dim)
|
| 122 |
|
| 123 |
def _repeat_kv(self, x: torch.Tensor) -> torch.Tensor:
|
| 124 |
-
"""KV
|
| 125 |
|
| 126 |
(B, num_kv_heads, S, head_dim) → (B, num_heads, S, head_dim)
|
| 127 |
|
| 128 |
-
|
| 129 |
[kv0, kv1, kv2, kv3] → [kv0,kv0,kv0,kv0, kv1,kv1,kv1,kv1, ...]
|
| 130 |
"""
|
| 131 |
B, H_kv, S, D = x.shape
|
|
|
|
| 11 |
|
| 12 |
|
| 13 |
class GroupedQueryAttention(nn.Module):
|
| 14 |
+
"""GQA: A memory-efficient variant of Multi-Head Attention.
|
| 15 |
|
| 16 |
MHA vs GQA vs MQA:
|
| 17 |
+
- MHA (Multi-Head Attention): Q, K, V all have num_heads → high memory usage
|
| 18 |
+
- MQA (Multi-Query Attention): K, V share a single head → risk of quality degradation
|
| 19 |
+
- GQA (Grouped Query Attention): K, V are grouped into num_kv_heads
|
| 20 |
+
→ a middle ground between MHA and MQA, good quality-efficiency balance
|
| 21 |
|
| 22 |
+
Example (num_heads=16, num_kv_heads=4):
|
| 23 |
+
Q heads: [0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15]
|
| 24 |
+
K/V groups: [ 0 , 1 , 2 , 3 ]
|
| 25 |
+
→ 4 Q heads share 1 K/V head
|
| 26 |
|
| 27 |
+
Attention formula:
|
| 28 |
Attention(Q, K, V) = softmax(Q·K^T / √d_k) · V
|
| 29 |
"""
|
| 30 |
|
|
|
|
| 36 |
self.num_kv_heads = config.num_kv_heads
|
| 37 |
self.num_kv_groups = config.num_kv_groups # num_heads // num_kv_heads
|
| 38 |
|
| 39 |
+
# Q/K/V projections
|
| 40 |
# Q: hidden_dim → num_heads × head_dim
|
| 41 |
self.q_proj = nn.Linear(config.hidden_dim, config.num_heads * self.head_dim, bias=False)
|
| 42 |
+
# K, V: hidden_dim → num_kv_heads × head_dim (smaller than Q!)
|
| 43 |
self.k_proj = nn.Linear(config.hidden_dim, config.num_kv_heads * self.head_dim, bias=False)
|
| 44 |
self.v_proj = nn.Linear(config.hidden_dim, config.num_kv_heads * self.head_dim, bias=False)
|
| 45 |
|
| 46 |
+
# Output projection: merge all head outputs back to hidden_dim
|
| 47 |
self.o_proj = nn.Linear(config.num_heads * self.head_dim, config.hidden_dim, bias=False)
|
| 48 |
|
| 49 |
# RoPE
|
|
|
|
| 51 |
dim=self.head_dim, max_seq_len=config.max_seq_len, theta=config.rope_theta
|
| 52 |
)
|
| 53 |
|
| 54 |
+
# Attention dropout (typically 0 during pretraining)
|
| 55 |
self.attn_dropout = nn.Dropout(config.dropout)
|
| 56 |
|
| 57 |
def forward(
|
|
|
|
| 64 |
Args:
|
| 65 |
x: (batch_size, seq_len, hidden_dim)
|
| 66 |
mask: (seq_len, seq_len) causal mask
|
| 67 |
+
position_offset: position offset (used during inference)
|
| 68 |
|
| 69 |
Returns:
|
| 70 |
(batch_size, seq_len, hidden_dim)
|
|
|
|
| 72 |
B, S, _ = x.shape
|
| 73 |
|
| 74 |
# ──────────────────────────────────────────────
|
| 75 |
+
# Step 1: Q, K, V projections
|
| 76 |
# ──────────────────────────────────────────────
|
| 77 |
q = self.q_proj(x) # (B, S, num_heads × head_dim)
|
| 78 |
k = self.k_proj(x) # (B, S, num_kv_heads × head_dim)
|
| 79 |
v = self.v_proj(x) # (B, S, num_kv_heads × head_dim)
|
| 80 |
|
| 81 |
+
# Reshape into multi-head form
|
| 82 |
q = q.view(B, S, self.num_heads, self.head_dim).transpose(1, 2)
|
| 83 |
# → (B, num_heads, S, head_dim)
|
| 84 |
k = k.view(B, S, self.num_kv_heads, self.head_dim).transpose(1, 2)
|
|
|
|
| 86 |
v = v.view(B, S, self.num_kv_heads, self.head_dim).transpose(1, 2)
|
| 87 |
|
| 88 |
# ──────────────────────────────────────────────
|
| 89 |
+
# Step 2: Apply RoPE (to Q and K only! Not to V)
|
| 90 |
# ──────────────────────────────────────────────
|
| 91 |
+
# Positional information should only affect "where to attend" (Q·K),
|
| 92 |
+
# not "what to retrieve" (V).
|
| 93 |
q, k = self.rope(q, k, position_offset)
|
| 94 |
|
| 95 |
# ──────────────────────────────────────────────
|
| 96 |
+
# Step 3: GQA - expand KV heads (repeat)
|
| 97 |
# ──────────────────────────────────────────────
|
| 98 |
+
# num_kv_heads=4 → num_heads=16: repeat each KV 4 times
|
| 99 |
if self.num_kv_groups > 1:
|
| 100 |
k = self._repeat_kv(k) # (B, num_heads, S, head_dim)
|
| 101 |
v = self._repeat_kv(v)
|
|
|
|
| 103 |
# ──────────────────────────────────────────────
|
| 104 |
# Step 4: Scaled Dot-Product Attention
|
| 105 |
# ──────────────────────────────────────────────
|
| 106 |
+
# Uses PyTorch >= 2.0's optimized implementation (Flash Attention applied automatically)
|
| 107 |
attn_out = F.scaled_dot_product_attention(
|
| 108 |
q, k, v,
|
| 109 |
attn_mask=mask,
|
| 110 |
dropout_p=self.config.dropout if self.training else 0.0,
|
| 111 |
+
is_causal=(mask is None), # apply automatic causal masking when no mask is provided
|
| 112 |
)
|
| 113 |
# → (B, num_heads, S, head_dim)
|
| 114 |
|
| 115 |
# ──────────────────────────────────────────────
|
| 116 |
+
# Step 5: Merge heads + output projection
|
| 117 |
# ──────────────────────────────────────────────
|
| 118 |
attn_out = attn_out.transpose(1, 2).contiguous().view(B, S, -1)
|
| 119 |
# → (B, S, num_heads × head_dim)
|
|
|
|
| 121 |
return self.o_proj(attn_out) # → (B, S, hidden_dim)
|
| 122 |
|
| 123 |
def _repeat_kv(self, x: torch.Tensor) -> torch.Tensor:
|
| 124 |
+
"""Repeat KV heads to match the number of Q heads.
|
| 125 |
|
| 126 |
(B, num_kv_heads, S, head_dim) → (B, num_heads, S, head_dim)
|
| 127 |
|
| 128 |
+
Example: num_kv_heads=4, num_kv_groups=4
|
| 129 |
[kv0, kv1, kv2, kv3] → [kv0,kv0,kv0,kv0, kv1,kv1,kv1,kv1, ...]
|
| 130 |
"""
|
| 131 |
B, H_kv, S, D = x.shape
|
llm_lab/model/feedforward.py
CHANGED
|
@@ -8,41 +8,41 @@ from llm_lab.config import ModelConfig
|
|
| 8 |
|
| 9 |
|
| 10 |
class SwiGLUFeedForward(nn.Module):
|
| 11 |
-
"""SwiGLU: Gated Linear Unit with Swish
|
| 12 |
|
| 13 |
-
|
| 14 |
FFN(x) = ReLU(x·W1 + b1)·W2 + b2
|
| 15 |
-
→
|
| 16 |
|
| 17 |
SwiGLU FFN:
|
| 18 |
SwiGLU(x) = (Swish(x·W_gate) ⊙ (x·W_up)) · W_down
|
| 19 |
-
→
|
| 20 |
|
| 21 |
-
|
| 22 |
-
- Swish(x) = x · sigmoid(x):
|
| 23 |
-
-
|
| 24 |
-
-
|
| 25 |
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
"""
|
| 30 |
|
| 31 |
def __init__(self, config: ModelConfig):
|
| 32 |
super().__init__()
|
| 33 |
-
#
|
| 34 |
self.gate_proj = nn.Linear(config.hidden_dim, config.intermediate_dim, bias=False)
|
| 35 |
-
#
|
| 36 |
self.up_proj = nn.Linear(config.hidden_dim, config.intermediate_dim, bias=False)
|
| 37 |
-
#
|
| 38 |
self.down_proj = nn.Linear(config.intermediate_dim, config.hidden_dim, bias=False)
|
| 39 |
|
| 40 |
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
| 41 |
# SwiGLU(x) = (Swish(gate(x)) ⊙ up(x)) · down
|
| 42 |
#
|
| 43 |
-
# 1) gate:
|
| 44 |
gate = F.silu(self.gate_proj(x)) # silu = Swish = x * sigmoid(x)
|
| 45 |
-
# 2) up:
|
| 46 |
up = self.up_proj(x)
|
| 47 |
-
# 3) element-wise
|
| 48 |
return self.down_proj(gate * up)
|
|
|
|
| 8 |
|
| 9 |
|
| 10 |
class SwiGLUFeedForward(nn.Module):
|
| 11 |
+
"""SwiGLU: Gated Linear Unit with Swish activation function.
|
| 12 |
|
| 13 |
+
Standard FFN:
|
| 14 |
FFN(x) = ReLU(x·W1 + b1)·W2 + b2
|
| 15 |
+
→ simple nonlinear transformation
|
| 16 |
|
| 17 |
SwiGLU FFN:
|
| 18 |
SwiGLU(x) = (Swish(x·W_gate) ⊙ (x·W_up)) · W_down
|
| 19 |
+
→ controls information flow via a gating mechanism
|
| 20 |
|
| 21 |
+
Why is SwiGLU better?
|
| 22 |
+
- Swish(x) = x · sigmoid(x): smooth activation, allows some negative values
|
| 23 |
+
- The gate vector learns "which information to let through"
|
| 24 |
+
- Consistently reported to outperform ReLU FFN in PaLM, LLaMA, etc.
|
| 25 |
|
| 26 |
+
Note: Having two up-projections (W_gate and W_up) means
|
| 27 |
+
1.5x the parameters of a standard FFN, but intermediate_dim is
|
| 28 |
+
adjusted to match the total parameter count.
|
| 29 |
"""
|
| 30 |
|
| 31 |
def __init__(self, config: ModelConfig):
|
| 32 |
super().__init__()
|
| 33 |
+
# Gate projection: hidden_dim → intermediate_dim
|
| 34 |
self.gate_proj = nn.Linear(config.hidden_dim, config.intermediate_dim, bias=False)
|
| 35 |
+
# Up projection: hidden_dim → intermediate_dim
|
| 36 |
self.up_proj = nn.Linear(config.hidden_dim, config.intermediate_dim, bias=False)
|
| 37 |
+
# Down projection: intermediate_dim → hidden_dim
|
| 38 |
self.down_proj = nn.Linear(config.intermediate_dim, config.hidden_dim, bias=False)
|
| 39 |
|
| 40 |
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
| 41 |
# SwiGLU(x) = (Swish(gate(x)) ⊙ up(x)) · down
|
| 42 |
#
|
| 43 |
+
# 1) gate: decides which information to pass through (Swish activation)
|
| 44 |
gate = F.silu(self.gate_proj(x)) # silu = Swish = x * sigmoid(x)
|
| 45 |
+
# 2) up: projects information to a higher dimension
|
| 46 |
up = self.up_proj(x)
|
| 47 |
+
# 3) element-wise multiplication (gating) → project back to original dimension
|
| 48 |
return self.down_proj(gate * up)
|
llm_lab/model/llm_model.py
CHANGED
|
@@ -13,19 +13,19 @@ from .transformer_block import TransformerBlock
|
|
| 13 |
|
| 14 |
|
| 15 |
class LLMModel(nn.Module):
|
| 16 |
-
"""1B
|
| 17 |
|
| 18 |
-
|
| 19 |
Input Token IDs
|
| 20 |
→ Token Embedding
|
| 21 |
→ [TransformerBlock] × num_layers (+ Activation Checkpointing)
|
| 22 |
-
→ RMSNorm (
|
| 23 |
→ Linear Head (→ vocab logits)
|
| 24 |
|
| 25 |
Weight Tying:
|
| 26 |
-
-
|
| 27 |
-
-
|
| 28 |
-
-
|
| 29 |
"""
|
| 30 |
|
| 31 |
def __init__(self, config: ModelConfig):
|
|
@@ -41,29 +41,29 @@ class LLMModel(nn.Module):
|
|
| 41 |
for i in range(config.num_layers)
|
| 42 |
])
|
| 43 |
|
| 44 |
-
# ──
|
| 45 |
self.final_norm = RMSNorm(config.hidden_dim, eps=config.norm_eps)
|
| 46 |
|
| 47 |
-
# ──
|
| 48 |
self.lm_head = nn.Linear(config.hidden_dim, config.vocab_size, bias=False)
|
| 49 |
-
# Weight Tying: lm_head
|
| 50 |
self.lm_head.weight = self.token_embedding.weight
|
| 51 |
|
| 52 |
-
#
|
| 53 |
self._init_weights()
|
| 54 |
|
| 55 |
def _init_weights(self):
|
| 56 |
-
"""
|
| 57 |
|
| 58 |
-
|
| 59 |
-
-
|
| 60 |
-
-
|
| 61 |
-
-
|
| 62 |
|
| 63 |
-
GPT-2
|
| 64 |
-
-
|
| 65 |
- Residual projection: N(0, 0.02 / √(2 × num_layers))
|
| 66 |
-
→
|
| 67 |
"""
|
| 68 |
std = 0.02
|
| 69 |
residual_std = std / math.sqrt(2 * self.config.num_layers)
|
|
@@ -76,7 +76,7 @@ class LLMModel(nn.Module):
|
|
| 76 |
elif isinstance(module, nn.Embedding):
|
| 77 |
nn.init.normal_(module.weight, mean=0.0, std=std)
|
| 78 |
|
| 79 |
-
#
|
| 80 |
for layer in self.layers:
|
| 81 |
nn.init.normal_(layer.attention.o_proj.weight, mean=0.0, std=residual_std)
|
| 82 |
nn.init.normal_(layer.feed_forward.down_proj.weight, mean=0.0, std=residual_std)
|
|
@@ -89,55 +89,55 @@ class LLMModel(nn.Module):
|
|
| 89 |
) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
|
| 90 |
"""
|
| 91 |
Args:
|
| 92 |
-
input_ids: (batch_size, seq_len) -
|
| 93 |
-
targets: (batch_size, seq_len) -
|
| 94 |
-
position_offset:
|
| 95 |
|
| 96 |
Returns:
|
| 97 |
logits: (batch_size, seq_len, vocab_size)
|
| 98 |
-
loss:
|
| 99 |
"""
|
| 100 |
B, S = input_ids.shape
|
| 101 |
|
| 102 |
# ── Step 1: Token Embedding ──
|
| 103 |
-
#
|
| 104 |
h = self.token_embedding(input_ids) # (B, S, hidden_dim)
|
| 105 |
|
| 106 |
# ── Step 2: Transformer Blocks ──
|
| 107 |
-
# Activation Checkpointing:
|
| 108 |
-
# (
|
| 109 |
for layer in self.layers:
|
| 110 |
if self.training and torch.is_grad_enabled():
|
| 111 |
-
# Activation Checkpointing
|
| 112 |
h = torch.utils.checkpoint.checkpoint(
|
| 113 |
layer, h, None, position_offset,
|
| 114 |
-
use_reentrant=False, # PyTorch >= 2.0
|
| 115 |
)
|
| 116 |
else:
|
| 117 |
h = layer(h, mask=None, position_offset=position_offset)
|
| 118 |
|
| 119 |
-
# ── Step 3:
|
| 120 |
h = self.final_norm(h)
|
| 121 |
|
| 122 |
-
# ── Step 4:
|
| 123 |
logits = self.lm_head(h) # (B, S, vocab_size)
|
| 124 |
|
| 125 |
-
# ── Step 5:
|
| 126 |
loss = None
|
| 127 |
if targets is not None:
|
| 128 |
-
# Cross-Entropy Loss:
|
| 129 |
# logits: (B, S, V) → (B*S, V)
|
| 130 |
# targets: (B, S) → (B*S,)
|
| 131 |
loss = F.cross_entropy(
|
| 132 |
logits.view(-1, self.config.vocab_size),
|
| 133 |
targets.view(-1),
|
| 134 |
-
ignore_index=-100, #
|
| 135 |
)
|
| 136 |
|
| 137 |
return logits, loss
|
| 138 |
|
| 139 |
def count_parameters(self, trainable_only: bool = True) -> int:
|
| 140 |
-
"""
|
| 141 |
if trainable_only:
|
| 142 |
return sum(p.numel() for p in self.parameters() if p.requires_grad)
|
| 143 |
return sum(p.numel() for p in self.parameters())
|
|
@@ -151,50 +151,50 @@ class LLMModel(nn.Module):
|
|
| 151 |
top_k: int = 50,
|
| 152 |
top_p: float = 0.9,
|
| 153 |
) -> torch.Tensor:
|
| 154 |
-
"""
|
| 155 |
|
| 156 |
-
Autoregressive
|
| 157 |
|
| 158 |
Args:
|
| 159 |
-
input_ids: (1, prompt_len) -
|
| 160 |
-
max_new_tokens:
|
| 161 |
-
temperature:
|
| 162 |
-
top_k:
|
| 163 |
-
top_p:
|
| 164 |
"""
|
| 165 |
self.eval()
|
| 166 |
generated = input_ids
|
| 167 |
|
| 168 |
for _ in range(max_new_tokens):
|
| 169 |
-
#
|
| 170 |
ctx = generated[:, -self.config.max_seq_len:]
|
| 171 |
|
| 172 |
# Forward pass
|
| 173 |
logits, _ = self(ctx)
|
| 174 |
-
#
|
| 175 |
next_logits = logits[:, -1, :] / temperature
|
| 176 |
|
| 177 |
-
# ── Top-K
|
| 178 |
if top_k > 0:
|
| 179 |
top_k_values, _ = torch.topk(next_logits, min(top_k, next_logits.size(-1)))
|
| 180 |
min_top_k = top_k_values[:, -1].unsqueeze(-1)
|
| 181 |
next_logits = next_logits.masked_fill(next_logits < min_top_k, float("-inf"))
|
| 182 |
|
| 183 |
-
# ── Top-P (Nucleus)
|
| 184 |
if top_p < 1.0:
|
| 185 |
sorted_logits, sorted_indices = torch.sort(next_logits, descending=True)
|
| 186 |
cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
|
| 187 |
-
#
|
| 188 |
remove_mask = cumulative_probs - F.softmax(sorted_logits, dim=-1) >= top_p
|
| 189 |
sorted_logits[remove_mask] = float("-inf")
|
| 190 |
-
#
|
| 191 |
next_logits = sorted_logits.scatter(1, sorted_indices, sorted_logits)
|
| 192 |
|
| 193 |
-
#
|
| 194 |
probs = F.softmax(next_logits, dim=-1)
|
| 195 |
next_token = torch.multinomial(probs, num_samples=1) # (B, 1)
|
| 196 |
|
| 197 |
-
#
|
| 198 |
generated = torch.cat([generated, next_token], dim=1)
|
| 199 |
|
| 200 |
return generated
|
|
|
|
| 13 |
|
| 14 |
|
| 15 |
class LLMModel(nn.Module):
|
| 16 |
+
"""1B parameter LLaMA-style Decoder-Only Transformer.
|
| 17 |
|
| 18 |
+
Overall structure:
|
| 19 |
Input Token IDs
|
| 20 |
→ Token Embedding
|
| 21 |
→ [TransformerBlock] × num_layers (+ Activation Checkpointing)
|
| 22 |
+
→ RMSNorm (final)
|
| 23 |
→ Linear Head (→ vocab logits)
|
| 24 |
|
| 25 |
Weight Tying:
|
| 26 |
+
- Shares weights between the input Embedding and the output Linear Head
|
| 27 |
+
- Saves parameters (~65M) while maintaining or improving performance
|
| 28 |
+
- Intuition: "representing word meaning" and "predicting words" use the same space
|
| 29 |
"""
|
| 30 |
|
| 31 |
def __init__(self, config: ModelConfig):
|
|
|
|
| 41 |
for i in range(config.num_layers)
|
| 42 |
])
|
| 43 |
|
| 44 |
+
# ── Final normalization ──
|
| 45 |
self.final_norm = RMSNorm(config.hidden_dim, eps=config.norm_eps)
|
| 46 |
|
| 47 |
+
# ── Output head (Weight Tying) ──
|
| 48 |
self.lm_head = nn.Linear(config.hidden_dim, config.vocab_size, bias=False)
|
| 49 |
+
# Weight Tying: lm_head weights = token_embedding weights
|
| 50 |
self.lm_head.weight = self.token_embedding.weight
|
| 51 |
|
| 52 |
+
# Weight initialization
|
| 53 |
self._init_weights()
|
| 54 |
|
| 55 |
def _init_weights(self):
|
| 56 |
+
"""Weight initialization strategy.
|
| 57 |
|
| 58 |
+
Why does initialization matter?
|
| 59 |
+
- Too large: activation explosion → NaN
|
| 60 |
+
- Too small: gradient vanishing → training stagnation
|
| 61 |
+
- Proper initialization: keeps output variance consistent across layers
|
| 62 |
|
| 63 |
+
GPT-2 style initialization:
|
| 64 |
+
- General Linear: N(0, 0.02)
|
| 65 |
- Residual projection: N(0, 0.02 / √(2 × num_layers))
|
| 66 |
+
→ reduces residual contribution as depth increases for stability
|
| 67 |
"""
|
| 68 |
std = 0.02
|
| 69 |
residual_std = std / math.sqrt(2 * self.config.num_layers)
|
|
|
|
| 76 |
elif isinstance(module, nn.Embedding):
|
| 77 |
nn.init.normal_(module.weight, mean=0.0, std=std)
|
| 78 |
|
| 79 |
+
# Apply scaled-down initialization to residual projection layers
|
| 80 |
for layer in self.layers:
|
| 81 |
nn.init.normal_(layer.attention.o_proj.weight, mean=0.0, std=residual_std)
|
| 82 |
nn.init.normal_(layer.feed_forward.down_proj.weight, mean=0.0, std=residual_std)
|
|
|
|
| 89 |
) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
|
| 90 |
"""
|
| 91 |
Args:
|
| 92 |
+
input_ids: (batch_size, seq_len) - token IDs
|
| 93 |
+
targets: (batch_size, seq_len) - ground-truth token IDs (during training)
|
| 94 |
+
position_offset: position offset (during inference)
|
| 95 |
|
| 96 |
Returns:
|
| 97 |
logits: (batch_size, seq_len, vocab_size)
|
| 98 |
+
loss: scalar (when targets are provided) or None
|
| 99 |
"""
|
| 100 |
B, S = input_ids.shape
|
| 101 |
|
| 102 |
# ── Step 1: Token Embedding ──
|
| 103 |
+
# Convert each token ID into a vector of dimension hidden_dim
|
| 104 |
h = self.token_embedding(input_ids) # (B, S, hidden_dim)
|
| 105 |
|
| 106 |
# ── Step 2: Transformer Blocks ──
|
| 107 |
+
# Activation Checkpointing: saves memory during training
|
| 108 |
+
# (does not store intermediate activations; recomputes them during backward)
|
| 109 |
for layer in self.layers:
|
| 110 |
if self.training and torch.is_grad_enabled():
|
| 111 |
+
# Apply Activation Checkpointing
|
| 112 |
h = torch.utils.checkpoint.checkpoint(
|
| 113 |
layer, h, None, position_offset,
|
| 114 |
+
use_reentrant=False, # recommended for PyTorch >= 2.0
|
| 115 |
)
|
| 116 |
else:
|
| 117 |
h = layer(h, mask=None, position_offset=position_offset)
|
| 118 |
|
| 119 |
+
# ── Step 3: Final normalization ──
|
| 120 |
h = self.final_norm(h)
|
| 121 |
|
| 122 |
+
# ── Step 4: Compute output logits ──
|
| 123 |
logits = self.lm_head(h) # (B, S, vocab_size)
|
| 124 |
|
| 125 |
+
# ── Step 5: Compute loss (during training) ──
|
| 126 |
loss = None
|
| 127 |
if targets is not None:
|
| 128 |
+
# Cross-Entropy Loss: next-token prediction
|
| 129 |
# logits: (B, S, V) → (B*S, V)
|
| 130 |
# targets: (B, S) → (B*S,)
|
| 131 |
loss = F.cross_entropy(
|
| 132 |
logits.view(-1, self.config.vocab_size),
|
| 133 |
targets.view(-1),
|
| 134 |
+
ignore_index=-100, # ignore padding tokens
|
| 135 |
)
|
| 136 |
|
| 137 |
return logits, loss
|
| 138 |
|
| 139 |
def count_parameters(self, trainable_only: bool = True) -> int:
|
| 140 |
+
"""Count the number of model parameters."""
|
| 141 |
if trainable_only:
|
| 142 |
return sum(p.numel() for p in self.parameters() if p.requires_grad)
|
| 143 |
return sum(p.numel() for p in self.parameters())
|
|
|
|
| 151 |
top_k: int = 50,
|
| 152 |
top_p: float = 0.9,
|
| 153 |
) -> torch.Tensor:
|
| 154 |
+
"""Text generation (inference).
|
| 155 |
|
| 156 |
+
Autoregressive generation: predicts and appends one token at a time.
|
| 157 |
|
| 158 |
Args:
|
| 159 |
+
input_ids: (1, prompt_len) - initial prompt
|
| 160 |
+
max_new_tokens: maximum number of tokens to generate
|
| 161 |
+
temperature: controls sharpness of probability distribution (lower = more conservative)
|
| 162 |
+
top_k: consider only the top k tokens by probability
|
| 163 |
+
top_p: consider only tokens up to cumulative probability p (nucleus sampling)
|
| 164 |
"""
|
| 165 |
self.eval()
|
| 166 |
generated = input_ids
|
| 167 |
|
| 168 |
for _ in range(max_new_tokens):
|
| 169 |
+
# Truncate if current sequence exceeds max_seq_len
|
| 170 |
ctx = generated[:, -self.config.max_seq_len:]
|
| 171 |
|
| 172 |
# Forward pass
|
| 173 |
logits, _ = self(ctx)
|
| 174 |
+
# Use only the last token's logits (next-token prediction)
|
| 175 |
next_logits = logits[:, -1, :] / temperature
|
| 176 |
|
| 177 |
+
# ── Top-K filtering ──
|
| 178 |
if top_k > 0:
|
| 179 |
top_k_values, _ = torch.topk(next_logits, min(top_k, next_logits.size(-1)))
|
| 180 |
min_top_k = top_k_values[:, -1].unsqueeze(-1)
|
| 181 |
next_logits = next_logits.masked_fill(next_logits < min_top_k, float("-inf"))
|
| 182 |
|
| 183 |
+
# ── Top-P (Nucleus) filtering ──
|
| 184 |
if top_p < 1.0:
|
| 185 |
sorted_logits, sorted_indices = torch.sort(next_logits, descending=True)
|
| 186 |
cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
|
| 187 |
+
# Remove tokens where cumulative probability exceeds top_p
|
| 188 |
remove_mask = cumulative_probs - F.softmax(sorted_logits, dim=-1) >= top_p
|
| 189 |
sorted_logits[remove_mask] = float("-inf")
|
| 190 |
+
# Restore original order
|
| 191 |
next_logits = sorted_logits.scatter(1, sorted_indices, sorted_logits)
|
| 192 |
|
| 193 |
+
# Sample from probability distribution
|
| 194 |
probs = F.softmax(next_logits, dim=-1)
|
| 195 |
next_token = torch.multinomial(probs, num_samples=1) # (B, 1)
|
| 196 |
|
| 197 |
+
# Append generated token
|
| 198 |
generated = torch.cat([generated, next_token], dim=1)
|
| 199 |
|
| 200 |
return generated
|
llm_lab/model/norm.py
CHANGED
|
@@ -5,36 +5,36 @@ import torch.nn as nn
|
|
| 5 |
|
| 6 |
|
| 7 |
class RMSNorm(nn.Module):
|
| 8 |
-
"""RMSNorm:
|
| 9 |
|
| 10 |
-
|
| 11 |
-
-
|
| 12 |
-
-
|
| 13 |
-
- bias
|
| 14 |
|
| 15 |
-
|
| 16 |
RMSNorm(x) = (x / RMS(x)) * γ
|
| 17 |
RMS(x) = sqrt(mean(x²) + ε)
|
| 18 |
|
| 19 |
-
|
| 20 |
-
→
|
| 21 |
-
→
|
| 22 |
"""
|
| 23 |
|
| 24 |
def __init__(self, dim: int, eps: float = 1e-6):
|
| 25 |
super().__init__()
|
| 26 |
self.eps = eps
|
| 27 |
-
# γ (gamma):
|
| 28 |
self.weight = nn.Parameter(torch.ones(dim))
|
| 29 |
|
| 30 |
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
| 31 |
-
# 1)
|
| 32 |
-
#
|
| 33 |
x_float = x.float()
|
| 34 |
|
| 35 |
-
# 2)
|
| 36 |
rms = torch.rsqrt(x_float.pow(2).mean(dim=-1, keepdim=True) + self.eps)
|
| 37 |
-
# rsqrt = 1/sqrt(x) →
|
| 38 |
|
| 39 |
-
# 3)
|
| 40 |
return (x_float * rms).to(x.dtype) * self.weight
|
|
|
|
| 5 |
|
| 6 |
|
| 7 |
class RMSNorm(nn.Module):
|
| 8 |
+
"""RMSNorm: A lightweight alternative to LayerNorm.
|
| 9 |
|
| 10 |
+
Differences from standard LayerNorm:
|
| 11 |
+
- Does not subtract the mean → saves computation
|
| 12 |
+
- Normalizes using RMS (Root Mean Square) instead of variance
|
| 13 |
+
- No bias parameter
|
| 14 |
|
| 15 |
+
Formula:
|
| 16 |
RMSNorm(x) = (x / RMS(x)) * γ
|
| 17 |
RMS(x) = sqrt(mean(x²) + ε)
|
| 18 |
|
| 19 |
+
Why is normalization necessary?
|
| 20 |
+
→ Stacking layers deeply causes activation values to explode or vanish.
|
| 21 |
+
→ Normalization keeps the input to each layer within a stable range.
|
| 22 |
"""
|
| 23 |
|
| 24 |
def __init__(self, dim: int, eps: float = 1e-6):
|
| 25 |
super().__init__()
|
| 26 |
self.eps = eps
|
| 27 |
+
# γ (gamma): learnable scale parameter, initialized to 1
|
| 28 |
self.weight = nn.Parameter(torch.ones(dim))
|
| 29 |
|
| 30 |
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
| 31 |
+
# 1) Cast input to float32 for numerical stability
|
| 32 |
+
# Computing the sum of squares in bf16/fp16 risks overflow
|
| 33 |
x_float = x.float()
|
| 34 |
|
| 35 |
+
# 2) Compute RMS: sqrt(mean(x²) + ε)
|
| 36 |
rms = torch.rsqrt(x_float.pow(2).mean(dim=-1, keepdim=True) + self.eps)
|
| 37 |
+
# rsqrt = 1/sqrt(x) → replaces division with multiplication (faster)
|
| 38 |
|
| 39 |
+
# 3) Normalize, restore original dtype, and apply scale
|
| 40 |
return (x_float * rms).to(x.dtype) * self.weight
|
llm_lab/model/rope.py
CHANGED
|
@@ -7,21 +7,23 @@ import torch.nn as nn
|
|
| 7 |
|
| 8 |
|
| 9 |
class RotaryPositionalEmbedding(nn.Module):
|
| 10 |
-
"""RoPE:
|
| 11 |
|
| 12 |
-
|
| 13 |
-
-
|
| 14 |
-
|
| 15 |
-
-
|
|
|
|
| 16 |
|
| 17 |
-
|
| 18 |
-
-
|
| 19 |
-
|
| 20 |
-
-
|
|
|
|
| 21 |
|
| 22 |
-
|
| 23 |
θ_i = theta^(-2i/d) (i = 0, 1, ..., d/2-1)
|
| 24 |
-
RoPE(x, pos) = x
|
| 25 |
"""
|
| 26 |
|
| 27 |
def __init__(self, dim: int, max_seq_len: int = 2048, theta: float = 10000.0):
|
|
@@ -30,16 +32,16 @@ class RotaryPositionalEmbedding(nn.Module):
|
|
| 30 |
self.max_seq_len = max_seq_len
|
| 31 |
self.theta = theta
|
| 32 |
|
| 33 |
-
#
|
| 34 |
# freqs[i] = 1 / (theta^(2i/dim)), i = 0, 1, ..., dim/2-1
|
| 35 |
freqs = 1.0 / (theta ** (torch.arange(0, dim, 2).float() / dim))
|
| 36 |
self.register_buffer("freqs", freqs, persistent=False)
|
| 37 |
|
| 38 |
-
# (max_seq_len, dim/2)
|
| 39 |
self._build_cache(max_seq_len)
|
| 40 |
|
| 41 |
def _build_cache(self, seq_len: int):
|
| 42 |
-
"""cos/sin
|
| 43 |
t = torch.arange(seq_len, device=self.freqs.device, dtype=torch.float32)
|
| 44 |
# outer product: (seq_len,) × (dim/2,) → (seq_len, dim/2)
|
| 45 |
angles = torch.outer(t, self.freqs)
|
|
@@ -49,23 +51,23 @@ class RotaryPositionalEmbedding(nn.Module):
|
|
| 49 |
def forward(
|
| 50 |
self, q: torch.Tensor, k: torch.Tensor, position_offset: int = 0
|
| 51 |
) -> Tuple[torch.Tensor, torch.Tensor]:
|
| 52 |
-
"""
|
| 53 |
|
| 54 |
Args:
|
| 55 |
q: (batch, num_heads, seq_len, head_dim)
|
| 56 |
k: (batch, num_kv_heads, seq_len, head_dim)
|
| 57 |
-
position_offset:
|
| 58 |
|
| 59 |
Returns:
|
| 60 |
-
|
| 61 |
"""
|
| 62 |
seq_len = q.shape[2]
|
| 63 |
|
| 64 |
-
#
|
| 65 |
if position_offset + seq_len > self.cos_cached.shape[0]:
|
| 66 |
self._build_cache(position_offset + seq_len)
|
| 67 |
|
| 68 |
-
#
|
| 69 |
cos = self.cos_cached[position_offset : position_offset + seq_len] # (seq_len, dim/2)
|
| 70 |
sin = self.sin_cached[position_offset : position_offset + seq_len]
|
| 71 |
|
|
@@ -77,27 +79,27 @@ class RotaryPositionalEmbedding(nn.Module):
|
|
| 77 |
def _apply_rotation(
|
| 78 |
x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor
|
| 79 |
) -> torch.Tensor:
|
| 80 |
-
"""
|
| 81 |
|
| 82 |
-
2D
|
| 83 |
[cos θ, -sin θ] [x1] [x1·cos θ - x2·sin θ]
|
| 84 |
[sin θ, cos θ] [x2] = [x1·sin θ + x2·cos θ]
|
| 85 |
|
| 86 |
-
|
| 87 |
"""
|
| 88 |
# x: (batch, heads, seq_len, head_dim)
|
| 89 |
-
#
|
| 90 |
-
x_even = x[..., 0::2] #
|
| 91 |
-
x_odd = x[..., 1::2] #
|
| 92 |
|
| 93 |
-
#
|
| 94 |
cos = cos.unsqueeze(0).unsqueeze(0)
|
| 95 |
sin = sin.unsqueeze(0).unsqueeze(0)
|
| 96 |
|
| 97 |
-
#
|
| 98 |
rotated_even = x_even * cos - x_odd * sin
|
| 99 |
rotated_odd = x_even * sin + x_odd * cos
|
| 100 |
|
| 101 |
-
#
|
| 102 |
out = torch.stack([rotated_even, rotated_odd], dim=-1)
|
| 103 |
-
return out.flatten(-2) #
|
|
|
|
| 7 |
|
| 8 |
|
| 9 |
class RotaryPositionalEmbedding(nn.Module):
|
| 10 |
+
"""RoPE: Relative positional encoding using rotation matrices.
|
| 11 |
|
| 12 |
+
Core idea:
|
| 13 |
+
- Each dimension pair (2i, 2i+1) is treated as coordinates in a 2D plane,
|
| 14 |
+
and is rotated by an angle proportional to the position.
|
| 15 |
+
- The attention score (Q·K) between two tokens depends only on their
|
| 16 |
+
relative distance.
|
| 17 |
|
| 18 |
+
Why RoPE?
|
| 19 |
+
- Absolute positional embeddings: add a fixed vector at each position
|
| 20 |
+
→ difficult to generalize to longer sequences
|
| 21 |
+
- Relative positional embeddings: complex implementation, extra parameters needed
|
| 22 |
+
- RoPE: encodes relative position information naturally with no extra parameters
|
| 23 |
|
| 24 |
+
Formula:
|
| 25 |
θ_i = theta^(-2i/d) (i = 0, 1, ..., d/2-1)
|
| 26 |
+
RoPE(x, pos) = rotate x in each dimension pair by pos × θ_i
|
| 27 |
"""
|
| 28 |
|
| 29 |
def __init__(self, dim: int, max_seq_len: int = 2048, theta: float = 10000.0):
|
|
|
|
| 32 |
self.max_seq_len = max_seq_len
|
| 33 |
self.theta = theta
|
| 34 |
|
| 35 |
+
# Pre-compute frequency vector (no training needed → register as buffer)
|
| 36 |
# freqs[i] = 1 / (theta^(2i/dim)), i = 0, 1, ..., dim/2-1
|
| 37 |
freqs = 1.0 / (theta ** (torch.arange(0, dim, 2).float() / dim))
|
| 38 |
self.register_buffer("freqs", freqs, persistent=False)
|
| 39 |
|
| 40 |
+
# Pre-compute cos/sin table of shape (max_seq_len, dim/2)
|
| 41 |
self._build_cache(max_seq_len)
|
| 42 |
|
| 43 |
def _build_cache(self, seq_len: int):
|
| 44 |
+
"""Pre-compute and cache cos/sin values."""
|
| 45 |
t = torch.arange(seq_len, device=self.freqs.device, dtype=torch.float32)
|
| 46 |
# outer product: (seq_len,) × (dim/2,) → (seq_len, dim/2)
|
| 47 |
angles = torch.outer(t, self.freqs)
|
|
|
|
| 51 |
def forward(
|
| 52 |
self, q: torch.Tensor, k: torch.Tensor, position_offset: int = 0
|
| 53 |
) -> Tuple[torch.Tensor, torch.Tensor]:
|
| 54 |
+
"""Apply rotary transformation to Q and K.
|
| 55 |
|
| 56 |
Args:
|
| 57 |
q: (batch, num_heads, seq_len, head_dim)
|
| 58 |
k: (batch, num_kv_heads, seq_len, head_dim)
|
| 59 |
+
position_offset: sequence start position offset (used with KV cache during inference)
|
| 60 |
|
| 61 |
Returns:
|
| 62 |
+
(q_rotated, k_rotated) with rotary transformation applied
|
| 63 |
"""
|
| 64 |
seq_len = q.shape[2]
|
| 65 |
|
| 66 |
+
# Extend cache if needed
|
| 67 |
if position_offset + seq_len > self.cos_cached.shape[0]:
|
| 68 |
self._build_cache(position_offset + seq_len)
|
| 69 |
|
| 70 |
+
# Slice cos/sin values for the current positions
|
| 71 |
cos = self.cos_cached[position_offset : position_offset + seq_len] # (seq_len, dim/2)
|
| 72 |
sin = self.sin_cached[position_offset : position_offset + seq_len]
|
| 73 |
|
|
|
|
| 79 |
def _apply_rotation(
|
| 80 |
x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor
|
| 81 |
) -> torch.Tensor:
|
| 82 |
+
"""Apply rotation transformation.
|
| 83 |
|
| 84 |
+
2D rotation matrix:
|
| 85 |
[cos θ, -sin θ] [x1] [x1·cos θ - x2·sin θ]
|
| 86 |
[sin θ, cos θ] [x2] = [x1·sin θ + x2·cos θ]
|
| 87 |
|
| 88 |
+
Implemented efficiently using vectorized operations.
|
| 89 |
"""
|
| 90 |
# x: (batch, heads, seq_len, head_dim)
|
| 91 |
+
# Separate even/odd indices: (x0, x1, x2, x3, ...) → (x0, x2, ...), (x1, x3, ...)
|
| 92 |
+
x_even = x[..., 0::2] # even indices
|
| 93 |
+
x_odd = x[..., 1::2] # odd indices
|
| 94 |
|
| 95 |
+
# Adjust dimensions for broadcasting: (seq_len, dim/2) → (1, 1, seq_len, dim/2)
|
| 96 |
cos = cos.unsqueeze(0).unsqueeze(0)
|
| 97 |
sin = sin.unsqueeze(0).unsqueeze(0)
|
| 98 |
|
| 99 |
+
# Apply rotation
|
| 100 |
rotated_even = x_even * cos - x_odd * sin
|
| 101 |
rotated_odd = x_even * sin + x_odd * cos
|
| 102 |
|
| 103 |
+
# Re-interleave: (even0, odd0, even1, odd1, ...)
|
| 104 |
out = torch.stack([rotated_even, rotated_odd], dim=-1)
|
| 105 |
+
return out.flatten(-2) # Merge last two dimensions to restore original shape
|
llm_lab/model/transformer_block.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
"""Transformer Block (
|
| 2 |
|
| 3 |
from typing import Optional
|
| 4 |
|
|
@@ -12,32 +12,32 @@ from .feedforward import SwiGLUFeedForward
|
|
| 12 |
|
| 13 |
|
| 14 |
class TransformerBlock(nn.Module):
|
| 15 |
-
"""
|
| 16 |
|
| 17 |
-
|
| 18 |
x → RMSNorm → Attention → + (residual) → RMSNorm → FFN → + (residual) → out
|
| 19 |
|
| 20 |
Pre-Norm vs Post-Norm:
|
| 21 |
-
- Post-Norm (
|
| 22 |
-
→
|
| 23 |
-
- Pre-Norm (GPT-2
|
| 24 |
-
→ gradient
|
| 25 |
|
| 26 |
-
Residual Connection
|
| 27 |
-
-
|
| 28 |
-
-
|
| 29 |
"""
|
| 30 |
|
| 31 |
def __init__(self, config: ModelConfig, layer_idx: int):
|
| 32 |
super().__init__()
|
| 33 |
self.layer_idx = layer_idx
|
| 34 |
|
| 35 |
-
# Pre-Norm:
|
| 36 |
self.attn_norm = RMSNorm(config.hidden_dim, eps=config.norm_eps)
|
| 37 |
# Self-Attention
|
| 38 |
self.attention = GroupedQueryAttention(config)
|
| 39 |
|
| 40 |
-
# Pre-Norm:
|
| 41 |
self.ffn_norm = RMSNorm(config.hidden_dim, eps=config.norm_eps)
|
| 42 |
# Feed-Forward Network
|
| 43 |
self.feed_forward = SwiGLUFeedForward(config)
|
|
|
|
| 1 |
+
"""Transformer Block (a single layer)."""
|
| 2 |
|
| 3 |
from typing import Optional
|
| 4 |
|
|
|
|
| 12 |
|
| 13 |
|
| 14 |
class TransformerBlock(nn.Module):
|
| 15 |
+
"""A single Transformer decoder block.
|
| 16 |
|
| 17 |
+
Structure (Pre-Norm style):
|
| 18 |
x → RMSNorm → Attention → + (residual) → RMSNorm → FFN → + (residual) → out
|
| 19 |
|
| 20 |
Pre-Norm vs Post-Norm:
|
| 21 |
+
- Post-Norm (original Transformer): LayerNorm applied after the residual
|
| 22 |
+
→ training instability in deep models
|
| 23 |
+
- Pre-Norm (standard since GPT-2): LayerNorm applied before the sublayer
|
| 24 |
+
→ smooth gradient flow, stable training
|
| 25 |
|
| 26 |
+
Role of Residual Connection:
|
| 27 |
+
- Adds the input to the output → a "highway" that lets gradients skip layers
|
| 28 |
+
- The key reason training is feasible even with 22 stacked layers
|
| 29 |
"""
|
| 30 |
|
| 31 |
def __init__(self, config: ModelConfig, layer_idx: int):
|
| 32 |
super().__init__()
|
| 33 |
self.layer_idx = layer_idx
|
| 34 |
|
| 35 |
+
# Pre-Norm: normalization before Attention
|
| 36 |
self.attn_norm = RMSNorm(config.hidden_dim, eps=config.norm_eps)
|
| 37 |
# Self-Attention
|
| 38 |
self.attention = GroupedQueryAttention(config)
|
| 39 |
|
| 40 |
+
# Pre-Norm: normalization before FFN
|
| 41 |
self.ffn_norm = RMSNorm(config.hidden_dim, eps=config.norm_eps)
|
| 42 |
# Feed-Forward Network
|
| 43 |
self.feed_forward = SwiGLUFeedForward(config)
|
llm_lab/model/utils.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
"""
|
| 2 |
|
| 3 |
from __future__ import annotations
|
| 4 |
|
|
@@ -12,7 +12,7 @@ if TYPE_CHECKING:
|
|
| 12 |
|
| 13 |
|
| 14 |
def count_parameters_detailed(model: "LLMModel") -> dict:
|
| 15 |
-
"""
|
| 16 |
total = 0
|
| 17 |
breakdown = {}
|
| 18 |
|
|
@@ -21,7 +21,7 @@ def count_parameters_detailed(model: "LLMModel") -> dict:
|
|
| 21 |
breakdown["token_embedding"] = emb_params
|
| 22 |
total += emb_params
|
| 23 |
|
| 24 |
-
#
|
| 25 |
layer_total = 0
|
| 26 |
layer_detail = {}
|
| 27 |
layer = model.layers[0]
|
|
@@ -40,7 +40,7 @@ def count_parameters_detailed(model: "LLMModel") -> dict:
|
|
| 40 |
breakdown["final_norm"] = norm_params
|
| 41 |
total += norm_params
|
| 42 |
|
| 43 |
-
# LM head (weight tying
|
| 44 |
breakdown["lm_head"] = "weight tying (0 additional)"
|
| 45 |
breakdown["total"] = total
|
| 46 |
|
|
@@ -48,12 +48,12 @@ def count_parameters_detailed(model: "LLMModel") -> dict:
|
|
| 48 |
|
| 49 |
|
| 50 |
def estimate_memory_gb(config: ModelConfig, batch_size: int = 4, dtype_bytes: int = 2) -> dict:
|
| 51 |
-
"""
|
| 52 |
|
| 53 |
Args:
|
| 54 |
-
dtype_bytes: 2 (bf16/fp16)
|
| 55 |
"""
|
| 56 |
-
#
|
| 57 |
emb = config.vocab_size * config.hidden_dim
|
| 58 |
per_layer = (
|
| 59 |
config.hidden_dim * (config.num_heads + 2 * config.num_kv_heads) * config.head_dim # QKV
|
|
@@ -67,11 +67,11 @@ def estimate_memory_gb(config: ModelConfig, batch_size: int = 4, dtype_bytes: in
|
|
| 67 |
optimizer_gb = total_params * 8 / 1e9 # AdamW: 2 states × fp32
|
| 68 |
gradient_gb = total_params * dtype_bytes / 1e9
|
| 69 |
|
| 70 |
-
#
|
| 71 |
-
#
|
| 72 |
activation_gb = (
|
| 73 |
-
batch_size * config.max_seq_len * config.hidden_dim * 4 #
|
| 74 |
-
* math.sqrt(config.num_layers) #
|
| 75 |
/ 1e9
|
| 76 |
)
|
| 77 |
|
|
|
|
| 1 |
+
"""Model utility functions."""
|
| 2 |
|
| 3 |
from __future__ import annotations
|
| 4 |
|
|
|
|
| 12 |
|
| 13 |
|
| 14 |
def count_parameters_detailed(model: "LLMModel") -> dict:
|
| 15 |
+
"""Print a detailed breakdown of the model's parameter count by component."""
|
| 16 |
total = 0
|
| 17 |
breakdown = {}
|
| 18 |
|
|
|
|
| 21 |
breakdown["token_embedding"] = emb_params
|
| 22 |
total += emb_params
|
| 23 |
|
| 24 |
+
# Per layer
|
| 25 |
layer_total = 0
|
| 26 |
layer_detail = {}
|
| 27 |
layer = model.layers[0]
|
|
|
|
| 40 |
breakdown["final_norm"] = norm_params
|
| 41 |
total += norm_params
|
| 42 |
|
| 43 |
+
# LM head (weight tying, so 0 additional parameters)
|
| 44 |
breakdown["lm_head"] = "weight tying (0 additional)"
|
| 45 |
breakdown["total"] = total
|
| 46 |
|
|
|
|
| 48 |
|
| 49 |
|
| 50 |
def estimate_memory_gb(config: ModelConfig, batch_size: int = 4, dtype_bytes: int = 2) -> dict:
|
| 51 |
+
"""Estimate GPU memory usage of the model.
|
| 52 |
|
| 53 |
Args:
|
| 54 |
+
dtype_bytes: 2 (bf16/fp16) or 4 (fp32)
|
| 55 |
"""
|
| 56 |
+
# Approximate parameter count
|
| 57 |
emb = config.vocab_size * config.hidden_dim
|
| 58 |
per_layer = (
|
| 59 |
config.hidden_dim * (config.num_heads + 2 * config.num_kv_heads) * config.head_dim # QKV
|
|
|
|
| 67 |
optimizer_gb = total_params * 8 / 1e9 # AdamW: 2 states × fp32
|
| 68 |
gradient_gb = total_params * dtype_bytes / 1e9
|
| 69 |
|
| 70 |
+
# Activation memory (assuming activation checkpointing is applied)
|
| 71 |
+
# Rough estimate: batch_size × seq_len × hidden_dim × num_layers × factor
|
| 72 |
activation_gb = (
|
| 73 |
+
batch_size * config.max_seq_len * config.hidden_dim * 4 # bytes
|
| 74 |
+
* math.sqrt(config.num_layers) # effect of checkpointing
|
| 75 |
/ 1e9
|
| 76 |
)
|
| 77 |
|
llm_lab/training/__init__.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
"""
|
| 2 |
from .scheduler import CosineWarmupScheduler
|
| 3 |
from .checkpoint import CheckpointManager
|
| 4 |
from .metrics import MetricsTracker
|
|
|
|
| 1 |
+
"""Training module — Gradient Accumulation, Mixed Precision, checkpointing, wandb logging."""
|
| 2 |
from .scheduler import CosineWarmupScheduler
|
| 3 |
from .checkpoint import CheckpointManager
|
| 4 |
from .metrics import MetricsTracker
|
llm_lab/training/checkpoint.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
"""
|
| 2 |
|
| 3 |
import json
|
| 4 |
import shutil
|
|
@@ -13,22 +13,22 @@ from llm_lab.config import TrainConfig
|
|
| 13 |
|
| 14 |
|
| 15 |
class CheckpointManager:
|
| 16 |
-
"""
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
-
|
| 20 |
-
- Google Drive
|
| 21 |
-
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
- model_state_dict:
|
| 25 |
-
- optimizer_state_dict:
|
| 26 |
-
- step:
|
| 27 |
-
- best_val_loss:
|
| 28 |
-
- config:
|
| 29 |
-
- rng_states:
|
| 30 |
-
- metrics_history:
|
| 31 |
-
- wandb_run_id: wandb
|
| 32 |
"""
|
| 33 |
|
| 34 |
def __init__(self, config: TrainConfig):
|
|
@@ -46,20 +46,20 @@ class CheckpointManager:
|
|
| 46 |
metrics_history: Dict[str, list],
|
| 47 |
wandb_run_id: Optional[str] = None,
|
| 48 |
):
|
| 49 |
-
"""
|
| 50 |
ckpt_path = self.checkpoint_dir / f"step_{step:06d}"
|
| 51 |
ckpt_path.mkdir(parents=True, exist_ok=True)
|
| 52 |
|
| 53 |
-
print(f"\n💾
|
| 54 |
start = time.time()
|
| 55 |
|
| 56 |
-
# 1)
|
| 57 |
torch.save(model.state_dict(), ckpt_path / "model.pt")
|
| 58 |
|
| 59 |
-
# 2)
|
| 60 |
torch.save(optimizer.state_dict(), ckpt_path / "optimizer.pt")
|
| 61 |
|
| 62 |
-
# 3)
|
| 63 |
meta = {
|
| 64 |
"step": step,
|
| 65 |
"best_val_loss": best_val_loss,
|
|
@@ -69,10 +69,10 @@ class CheckpointManager:
|
|
| 69 |
with open(ckpt_path / "meta.json", "w") as f:
|
| 70 |
json.dump(meta, f, indent=2)
|
| 71 |
|
| 72 |
-
# 4)
|
| 73 |
torch.save(metrics_history, ckpt_path / "metrics.pt")
|
| 74 |
|
| 75 |
-
# 5)
|
| 76 |
rng_states = {
|
| 77 |
"python": torch.random.get_rng_state(),
|
| 78 |
"cuda": torch.cuda.get_rng_state() if torch.cuda.is_available() else None,
|
|
@@ -81,9 +81,9 @@ class CheckpointManager:
|
|
| 81 |
|
| 82 |
elapsed = time.time() - start
|
| 83 |
ckpt_size = sum(f.stat().st_size for f in ckpt_path.rglob("*")) / 1e9
|
| 84 |
-
print(f"
|
| 85 |
|
| 86 |
-
#
|
| 87 |
self._cleanup_old_checkpoints()
|
| 88 |
|
| 89 |
def load_latest(
|
|
@@ -92,42 +92,42 @@ class CheckpointManager:
|
|
| 92 |
optimizer: Optional[torch.optim.Optimizer] = None,
|
| 93 |
device: torch.device = torch.device("cpu"),
|
| 94 |
) -> Dict[str, Any]:
|
| 95 |
-
"""
|
| 96 |
|
| 97 |
Returns:
|
| 98 |
{"step", "best_val_loss", "wandb_run_id", "metrics_history"}
|
| 99 |
-
|
| 100 |
"""
|
| 101 |
ckpt_path = self._find_latest()
|
| 102 |
if ckpt_path is None:
|
| 103 |
-
print("[Checkpoint]
|
| 104 |
return None
|
| 105 |
|
| 106 |
-
print(f"\n📂
|
| 107 |
start = time.time()
|
| 108 |
|
| 109 |
-
# 1)
|
| 110 |
model_state = torch.load(ckpt_path / "model.pt", map_location=device, weights_only=True)
|
| 111 |
model.load_state_dict(model_state)
|
| 112 |
-
del model_state #
|
| 113 |
|
| 114 |
-
# 2)
|
| 115 |
if optimizer is not None:
|
| 116 |
optim_state = torch.load(ckpt_path / "optimizer.pt", map_location=device, weights_only=True)
|
| 117 |
optimizer.load_state_dict(optim_state)
|
| 118 |
del optim_state
|
| 119 |
|
| 120 |
-
# 3)
|
| 121 |
with open(ckpt_path / "meta.json", "r") as f:
|
| 122 |
meta = json.load(f)
|
| 123 |
|
| 124 |
-
# 4)
|
| 125 |
metrics_history = {}
|
| 126 |
metrics_path = ckpt_path / "metrics.pt"
|
| 127 |
if metrics_path.exists():
|
| 128 |
metrics_history = torch.load(metrics_path, weights_only=False)
|
| 129 |
|
| 130 |
-
# 5)
|
| 131 |
rng_path = ckpt_path / "rng_states.pt"
|
| 132 |
if rng_path.exists():
|
| 133 |
rng_states = torch.load(rng_path, weights_only=False)
|
|
@@ -136,7 +136,7 @@ class CheckpointManager:
|
|
| 136 |
torch.cuda.set_rng_state(rng_states["cuda"])
|
| 137 |
|
| 138 |
elapsed = time.time() - start
|
| 139 |
-
print(f"
|
| 140 |
|
| 141 |
return {
|
| 142 |
"step": meta["step"],
|
|
@@ -146,14 +146,14 @@ class CheckpointManager:
|
|
| 146 |
}
|
| 147 |
|
| 148 |
def _find_latest(self) -> Optional[Path]:
|
| 149 |
-
"""
|
| 150 |
ckpts = sorted(self.checkpoint_dir.glob("step_*"))
|
| 151 |
return ckpts[-1] if ckpts else None
|
| 152 |
|
| 153 |
def _cleanup_old_checkpoints(self):
|
| 154 |
-
"""
|
| 155 |
ckpts = sorted(self.checkpoint_dir.glob("step_*"))
|
| 156 |
while len(ckpts) > self.max_checkpoints:
|
| 157 |
old = ckpts.pop(0)
|
| 158 |
-
print(f" 🗑️
|
| 159 |
shutil.rmtree(old)
|
|
|
|
| 1 |
+
"""Training state save/restore manager."""
|
| 2 |
|
| 3 |
import json
|
| 4 |
import shutil
|
|
|
|
| 13 |
|
| 14 |
|
| 15 |
class CheckpointManager:
|
| 16 |
+
"""Training state save/restore manager.
|
| 17 |
+
|
| 18 |
+
Why checkpoints matter in Colab:
|
| 19 |
+
- Session expiry (up to ~24 hours) causes all in-memory state to be lost
|
| 20 |
+
- Saving to Google Drive enables continuous training across sessions
|
| 21 |
+
- Optimizer state must be saved to preserve AdamW momentum
|
| 22 |
+
|
| 23 |
+
Saved contents:
|
| 24 |
+
- model_state_dict: model weights
|
| 25 |
+
- optimizer_state_dict: optimizer state (m, v momentum)
|
| 26 |
+
- step: current training step
|
| 27 |
+
- best_val_loss: lowest validation loss
|
| 28 |
+
- config: training configuration (for reproducibility)
|
| 29 |
+
- rng_states: random seed state (full reproducibility)
|
| 30 |
+
- metrics_history: training metrics history
|
| 31 |
+
- wandb_run_id: wandb run ID (for logging continuity)
|
| 32 |
"""
|
| 33 |
|
| 34 |
def __init__(self, config: TrainConfig):
|
|
|
|
| 46 |
metrics_history: Dict[str, list],
|
| 47 |
wandb_run_id: Optional[str] = None,
|
| 48 |
):
|
| 49 |
+
"""Saves a checkpoint."""
|
| 50 |
ckpt_path = self.checkpoint_dir / f"step_{step:06d}"
|
| 51 |
ckpt_path.mkdir(parents=True, exist_ok=True)
|
| 52 |
|
| 53 |
+
print(f"\n💾 Saving checkpoint: {ckpt_path}")
|
| 54 |
start = time.time()
|
| 55 |
|
| 56 |
+
# 1) Model weights (saved as-is in bf16)
|
| 57 |
torch.save(model.state_dict(), ckpt_path / "model.pt")
|
| 58 |
|
| 59 |
+
# 2) Optimizer state (includes fp32 momentum, can be large)
|
| 60 |
torch.save(optimizer.state_dict(), ckpt_path / "optimizer.pt")
|
| 61 |
|
| 62 |
+
# 3) Training metadata
|
| 63 |
meta = {
|
| 64 |
"step": step,
|
| 65 |
"best_val_loss": best_val_loss,
|
|
|
|
| 69 |
with open(ckpt_path / "meta.json", "w") as f:
|
| 70 |
json.dump(meta, f, indent=2)
|
| 71 |
|
| 72 |
+
# 4) Metrics history
|
| 73 |
torch.save(metrics_history, ckpt_path / "metrics.pt")
|
| 74 |
|
| 75 |
+
# 5) Random states (for full reproducibility)
|
| 76 |
rng_states = {
|
| 77 |
"python": torch.random.get_rng_state(),
|
| 78 |
"cuda": torch.cuda.get_rng_state() if torch.cuda.is_available() else None,
|
|
|
|
| 81 |
|
| 82 |
elapsed = time.time() - start
|
| 83 |
ckpt_size = sum(f.stat().st_size for f in ckpt_path.rglob("*")) / 1e9
|
| 84 |
+
print(f" Save complete: {ckpt_size:.2f} GB, {elapsed:.1f}s")
|
| 85 |
|
| 86 |
+
# Remove old checkpoints (rolling)
|
| 87 |
self._cleanup_old_checkpoints()
|
| 88 |
|
| 89 |
def load_latest(
|
|
|
|
| 92 |
optimizer: Optional[torch.optim.Optimizer] = None,
|
| 93 |
device: torch.device = torch.device("cpu"),
|
| 94 |
) -> Dict[str, Any]:
|
| 95 |
+
"""Loads the most recent checkpoint.
|
| 96 |
|
| 97 |
Returns:
|
| 98 |
{"step", "best_val_loss", "wandb_run_id", "metrics_history"}
|
| 99 |
+
or None if no checkpoint exists
|
| 100 |
"""
|
| 101 |
ckpt_path = self._find_latest()
|
| 102 |
if ckpt_path is None:
|
| 103 |
+
print("[Checkpoint] No saved checkpoint found. Starting from scratch.")
|
| 104 |
return None
|
| 105 |
|
| 106 |
+
print(f"\n📂 Loading checkpoint: {ckpt_path}")
|
| 107 |
start = time.time()
|
| 108 |
|
| 109 |
+
# 1) Model weights
|
| 110 |
model_state = torch.load(ckpt_path / "model.pt", map_location=device, weights_only=True)
|
| 111 |
model.load_state_dict(model_state)
|
| 112 |
+
del model_state # free memory
|
| 113 |
|
| 114 |
+
# 2) Optimizer state
|
| 115 |
if optimizer is not None:
|
| 116 |
optim_state = torch.load(ckpt_path / "optimizer.pt", map_location=device, weights_only=True)
|
| 117 |
optimizer.load_state_dict(optim_state)
|
| 118 |
del optim_state
|
| 119 |
|
| 120 |
+
# 3) Metadata
|
| 121 |
with open(ckpt_path / "meta.json", "r") as f:
|
| 122 |
meta = json.load(f)
|
| 123 |
|
| 124 |
+
# 4) Metrics history
|
| 125 |
metrics_history = {}
|
| 126 |
metrics_path = ckpt_path / "metrics.pt"
|
| 127 |
if metrics_path.exists():
|
| 128 |
metrics_history = torch.load(metrics_path, weights_only=False)
|
| 129 |
|
| 130 |
+
# 5) Restore random states
|
| 131 |
rng_path = ckpt_path / "rng_states.pt"
|
| 132 |
if rng_path.exists():
|
| 133 |
rng_states = torch.load(rng_path, weights_only=False)
|
|
|
|
| 136 |
torch.cuda.set_rng_state(rng_states["cuda"])
|
| 137 |
|
| 138 |
elapsed = time.time() - start
|
| 139 |
+
print(f" Load complete: step={meta['step']}, {elapsed:.1f}s")
|
| 140 |
|
| 141 |
return {
|
| 142 |
"step": meta["step"],
|
|
|
|
| 146 |
}
|
| 147 |
|
| 148 |
def _find_latest(self) -> Optional[Path]:
|
| 149 |
+
"""Finds the path of the most recent checkpoint."""
|
| 150 |
ckpts = sorted(self.checkpoint_dir.glob("step_*"))
|
| 151 |
return ckpts[-1] if ckpts else None
|
| 152 |
|
| 153 |
def _cleanup_old_checkpoints(self):
|
| 154 |
+
"""Removes old checkpoints (rolling)."""
|
| 155 |
ckpts = sorted(self.checkpoint_dir.glob("step_*"))
|
| 156 |
while len(ckpts) > self.max_checkpoints:
|
| 157 |
old = ckpts.pop(0)
|
| 158 |
+
print(f" 🗑️ Removing old checkpoint: {old.name}")
|
| 159 |
shutil.rmtree(old)
|
llm_lab/training/metrics.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
"""
|
| 2 |
|
| 3 |
from typing import Dict, Optional
|
| 4 |
|
|
@@ -8,16 +8,16 @@ from llm_lab.config import TrainConfig
|
|
| 8 |
|
| 9 |
|
| 10 |
class MetricsTracker:
|
| 11 |
-
"""
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
- train/loss:
|
| 15 |
-
- train/lr:
|
| 16 |
-
- train/grad_norm:
|
| 17 |
-
- train/tokens_per_sec:
|
| 18 |
-
- train/gpu_mem_gb: GPU
|
| 19 |
-
- val/loss:
|
| 20 |
-
- val/perplexity:
|
| 21 |
"""
|
| 22 |
|
| 23 |
def __init__(self, config: TrainConfig):
|
|
@@ -33,13 +33,13 @@ class MetricsTracker:
|
|
| 33 |
"val_ppl": [],
|
| 34 |
}
|
| 35 |
|
| 36 |
-
# wandb
|
| 37 |
self.wandb_run = None
|
| 38 |
if config.use_wandb:
|
| 39 |
self._init_wandb()
|
| 40 |
|
| 41 |
def _init_wandb(self, resume_id: Optional[str] = None):
|
| 42 |
-
"""wandb
|
| 43 |
try:
|
| 44 |
import wandb
|
| 45 |
|
|
@@ -51,16 +51,16 @@ class MetricsTracker:
|
|
| 51 |
resume="allow",
|
| 52 |
config=self.config.__dict__,
|
| 53 |
)
|
| 54 |
-
print(f"[wandb]
|
| 55 |
except ImportError:
|
| 56 |
-
print("[wandb]
|
| 57 |
self.config.use_wandb = False
|
| 58 |
except Exception as e:
|
| 59 |
-
print(f"[wandb]
|
| 60 |
self.config.use_wandb = False
|
| 61 |
|
| 62 |
def resume_wandb(self, run_id: str):
|
| 63 |
-
"""
|
| 64 |
if self.config.use_wandb:
|
| 65 |
self._init_wandb(resume_id=run_id)
|
| 66 |
|
|
@@ -73,7 +73,7 @@ class MetricsTracker:
|
|
| 73 |
tokens_per_sec: float,
|
| 74 |
gpu_mem_gb: float,
|
| 75 |
):
|
| 76 |
-
"""
|
| 77 |
self.history["step"].append(step)
|
| 78 |
self.history["train_loss"].append(loss)
|
| 79 |
self.history["learning_rate"].append(lr)
|
|
@@ -93,7 +93,7 @@ class MetricsTracker:
|
|
| 93 |
}, step=step)
|
| 94 |
|
| 95 |
def log_eval(self, step: int, val_loss: float, val_ppl: float):
|
| 96 |
-
"""
|
| 97 |
self.history["val_loss"].append(val_loss)
|
| 98 |
self.history["val_ppl"].append(val_ppl)
|
| 99 |
|
|
|
|
| 1 |
+
"""Training metrics tracking and logging."""
|
| 2 |
|
| 3 |
from typing import Dict, Optional
|
| 4 |
|
|
|
|
| 8 |
|
| 9 |
|
| 10 |
class MetricsTracker:
|
| 11 |
+
"""Tracks and logs training metrics.
|
| 12 |
+
|
| 13 |
+
Tracked items:
|
| 14 |
+
- train/loss: training loss (Cross-Entropy)
|
| 15 |
+
- train/lr: current learning rate
|
| 16 |
+
- train/grad_norm: gradient L2 norm
|
| 17 |
+
- train/tokens_per_sec: throughput
|
| 18 |
+
- train/gpu_mem_gb: GPU memory usage
|
| 19 |
+
- val/loss: validation loss
|
| 20 |
+
- val/perplexity: validation perplexity (= exp(loss))
|
| 21 |
"""
|
| 22 |
|
| 23 |
def __init__(self, config: TrainConfig):
|
|
|
|
| 33 |
"val_ppl": [],
|
| 34 |
}
|
| 35 |
|
| 36 |
+
# wandb initialization
|
| 37 |
self.wandb_run = None
|
| 38 |
if config.use_wandb:
|
| 39 |
self._init_wandb()
|
| 40 |
|
| 41 |
def _init_wandb(self, resume_id: Optional[str] = None):
|
| 42 |
+
"""Initializes wandb (supports continuous logging across sessions)."""
|
| 43 |
try:
|
| 44 |
import wandb
|
| 45 |
|
|
|
|
| 51 |
resume="allow",
|
| 52 |
config=self.config.__dict__,
|
| 53 |
)
|
| 54 |
+
print(f"[wandb] Initialized: {self.wandb_run.url}")
|
| 55 |
except ImportError:
|
| 56 |
+
print("[wandb] Not installed. Using console logging only.")
|
| 57 |
self.config.use_wandb = False
|
| 58 |
except Exception as e:
|
| 59 |
+
print(f"[wandb] Initialization failed: {e}. Using console logging only.")
|
| 60 |
self.config.use_wandb = False
|
| 61 |
|
| 62 |
def resume_wandb(self, run_id: str):
|
| 63 |
+
"""Resumes logging from a previous wandb run."""
|
| 64 |
if self.config.use_wandb:
|
| 65 |
self._init_wandb(resume_id=run_id)
|
| 66 |
|
|
|
|
| 73 |
tokens_per_sec: float,
|
| 74 |
gpu_mem_gb: float,
|
| 75 |
):
|
| 76 |
+
"""Records training step metrics."""
|
| 77 |
self.history["step"].append(step)
|
| 78 |
self.history["train_loss"].append(loss)
|
| 79 |
self.history["learning_rate"].append(lr)
|
|
|
|
| 93 |
}, step=step)
|
| 94 |
|
| 95 |
def log_eval(self, step: int, val_loss: float, val_ppl: float):
|
| 96 |
+
"""Records validation metrics."""
|
| 97 |
self.history["val_loss"].append(val_loss)
|
| 98 |
self.history["val_ppl"].append(val_ppl)
|
| 99 |
|
llm_lab/training/optimizer.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
"""AdamW
|
| 2 |
|
| 3 |
import torch
|
| 4 |
import torch.nn as nn
|
|
@@ -7,19 +7,19 @@ from llm_lab.config import TrainConfig
|
|
| 7 |
|
| 8 |
|
| 9 |
def create_optimizer(model: nn.Module, config: TrainConfig) -> torch.optim.AdamW:
|
| 10 |
-
"""
|
| 11 |
|
| 12 |
-
Weight Decay
|
| 13 |
-
-
|
| 14 |
-
-
|
| 15 |
|
| 16 |
-
|
| 17 |
-
- Weight Decay
|
| 18 |
-
-
|
| 19 |
-
-
|
| 20 |
-
- 1D
|
| 21 |
"""
|
| 22 |
-
#
|
| 23 |
decay_params = []
|
| 24 |
no_decay_params = []
|
| 25 |
|
|
@@ -27,7 +27,7 @@ def create_optimizer(model: nn.Module, config: TrainConfig) -> torch.optim.AdamW
|
|
| 27 |
if not param.requires_grad:
|
| 28 |
continue
|
| 29 |
|
| 30 |
-
# 1D
|
| 31 |
if param.dim() <= 1 or "embedding" in name:
|
| 32 |
no_decay_params.append(param)
|
| 33 |
else:
|
|
@@ -40,15 +40,15 @@ def create_optimizer(model: nn.Module, config: TrainConfig) -> torch.optim.AdamW
|
|
| 40 |
|
| 41 |
n_decay = sum(p.numel() for p in decay_params)
|
| 42 |
n_no_decay = sum(p.numel() for p in no_decay_params)
|
| 43 |
-
print(f"[Optimizer] Decay
|
| 44 |
-
print(f"[Optimizer] No-decay
|
| 45 |
|
| 46 |
optimizer = torch.optim.AdamW(
|
| 47 |
param_groups,
|
| 48 |
lr=config.learning_rate,
|
| 49 |
betas=(config.beta1, config.beta2),
|
| 50 |
eps=config.adam_eps,
|
| 51 |
-
fused=torch.cuda.is_available(), # CUDA fused AdamW (
|
| 52 |
)
|
| 53 |
|
| 54 |
return optimizer
|
|
|
|
| 1 |
+
"""AdamW optimizer creation with Weight Decay separation."""
|
| 2 |
|
| 3 |
import torch
|
| 4 |
import torch.nn as nn
|
|
|
|
| 7 |
|
| 8 |
|
| 9 |
def create_optimizer(model: nn.Module, config: TrainConfig) -> torch.optim.AdamW:
|
| 10 |
+
"""Creates an AdamW optimizer.
|
| 11 |
|
| 12 |
+
Weight Decay separation rules:
|
| 13 |
+
- Apply decay: Linear weights (attention proj, FFN, etc.)
|
| 14 |
+
- No decay: Embeddings, LayerNorm/RMSNorm, Bias
|
| 15 |
|
| 16 |
+
Why separate?
|
| 17 |
+
- Weight Decay penalizes large weights to prevent overfitting
|
| 18 |
+
- However, applying it to Norm scale parameters interferes with normalization
|
| 19 |
+
- Applying it to Embeddings causes rare token representations to shrink toward 0
|
| 20 |
+
- It is convention to exclude 1D parameters (bias, norm weight) from decay
|
| 21 |
"""
|
| 22 |
+
# Separate parameters into decay / no-decay groups
|
| 23 |
decay_params = []
|
| 24 |
no_decay_params = []
|
| 25 |
|
|
|
|
| 27 |
if not param.requires_grad:
|
| 28 |
continue
|
| 29 |
|
| 30 |
+
# 1D tensors (bias, norm weight) or embedding → no decay
|
| 31 |
if param.dim() <= 1 or "embedding" in name:
|
| 32 |
no_decay_params.append(param)
|
| 33 |
else:
|
|
|
|
| 40 |
|
| 41 |
n_decay = sum(p.numel() for p in decay_params)
|
| 42 |
n_no_decay = sum(p.numel() for p in no_decay_params)
|
| 43 |
+
print(f"[Optimizer] Decay parameters: {n_decay:,} ({n_decay/1e6:.1f}M)")
|
| 44 |
+
print(f"[Optimizer] No-decay parameters: {n_no_decay:,} ({n_no_decay/1e6:.1f}M)")
|
| 45 |
|
| 46 |
optimizer = torch.optim.AdamW(
|
| 47 |
param_groups,
|
| 48 |
lr=config.learning_rate,
|
| 49 |
betas=(config.beta1, config.beta2),
|
| 50 |
eps=config.adam_eps,
|
| 51 |
+
fused=torch.cuda.is_available(), # CUDA fused AdamW (faster)
|
| 52 |
)
|
| 53 |
|
| 54 |
return optimizer
|
llm_lab/training/runner.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
"""
|
| 2 |
|
| 3 |
from pathlib import Path
|
| 4 |
from typing import Optional
|
|
@@ -20,49 +20,49 @@ def start_training(
|
|
| 20 |
seq_len: int = 2048,
|
| 21 |
auto_config: bool = True,
|
| 22 |
) -> Trainer:
|
| 23 |
-
"""
|
| 24 |
|
| 25 |
-
|
| 26 |
```python
|
| 27 |
from model import LLMModel, ModelConfig
|
| 28 |
from data_pipeline import setup_data_pipeline, DataConfig
|
| 29 |
from trainer import start_training, TrainConfig
|
| 30 |
|
| 31 |
-
# 1.
|
| 32 |
model_config = ModelConfig.base_1b()
|
| 33 |
model = LLMModel(model_config)
|
| 34 |
|
| 35 |
-
# 2.
|
| 36 |
tok, train_dl, val_dl = setup_data_pipeline("pretrained")
|
| 37 |
|
| 38 |
-
# 3.
|
| 39 |
trainer = start_training(model, train_dl, val_dl)
|
| 40 |
```
|
| 41 |
"""
|
| 42 |
config = config or TrainConfig()
|
| 43 |
|
| 44 |
-
# GPU
|
| 45 |
if auto_config:
|
| 46 |
config = auto_configure(config)
|
| 47 |
|
| 48 |
-
# Google Drive
|
| 49 |
if "/content/drive" in config.checkpoint_dir:
|
| 50 |
drive_path = Path("/content/drive/MyDrive")
|
| 51 |
if not drive_path.exists():
|
| 52 |
-
print("\n⚠️ Google Drive
|
| 53 |
-
print("
|
| 54 |
-
print("
|
| 55 |
config.checkpoint_dir = "./checkpoints"
|
| 56 |
|
| 57 |
-
#
|
| 58 |
torch.manual_seed(config.seed)
|
| 59 |
if torch.cuda.is_available():
|
| 60 |
torch.cuda.manual_seed(config.seed)
|
| 61 |
|
| 62 |
-
# Trainer
|
| 63 |
trainer = Trainer(model, train_dataloader, val_dataloader, config, seq_len)
|
| 64 |
|
| 65 |
-
#
|
| 66 |
trainer.train()
|
| 67 |
|
| 68 |
return trainer
|
|
|
|
| 1 |
+
"""Training execution helper (Quick Start)."""
|
| 2 |
|
| 3 |
from pathlib import Path
|
| 4 |
from typing import Optional
|
|
|
|
| 20 |
seq_len: int = 2048,
|
| 21 |
auto_config: bool = True,
|
| 22 |
) -> Trainer:
|
| 23 |
+
"""Starts training (one-line execution).
|
| 24 |
|
| 25 |
+
Usage (Colab):
|
| 26 |
```python
|
| 27 |
from model import LLMModel, ModelConfig
|
| 28 |
from data_pipeline import setup_data_pipeline, DataConfig
|
| 29 |
from trainer import start_training, TrainConfig
|
| 30 |
|
| 31 |
+
# 1. Create model
|
| 32 |
model_config = ModelConfig.base_1b()
|
| 33 |
model = LLMModel(model_config)
|
| 34 |
|
| 35 |
+
# 2. Data pipeline
|
| 36 |
tok, train_dl, val_dl = setup_data_pipeline("pretrained")
|
| 37 |
|
| 38 |
+
# 3. Start training (automatic checkpoint restoration)
|
| 39 |
trainer = start_training(model, train_dl, val_dl)
|
| 40 |
```
|
| 41 |
"""
|
| 42 |
config = config or TrainConfig()
|
| 43 |
|
| 44 |
+
# Auto-detect GPU and adjust configuration
|
| 45 |
if auto_config:
|
| 46 |
config = auto_configure(config)
|
| 47 |
|
| 48 |
+
# Check Google Drive mount (Colab)
|
| 49 |
if "/content/drive" in config.checkpoint_dir:
|
| 50 |
drive_path = Path("/content/drive/MyDrive")
|
| 51 |
if not drive_path.exists():
|
| 52 |
+
print("\n⚠️ Google Drive is not mounted!")
|
| 53 |
+
print(" Run in Colab: from google.colab import drive; drive.mount('/content/drive')")
|
| 54 |
+
print(" Switching to local path.")
|
| 55 |
config.checkpoint_dir = "./checkpoints"
|
| 56 |
|
| 57 |
+
# Set reproducibility seed
|
| 58 |
torch.manual_seed(config.seed)
|
| 59 |
if torch.cuda.is_available():
|
| 60 |
torch.cuda.manual_seed(config.seed)
|
| 61 |
|
| 62 |
+
# Create Trainer (includes automatic checkpoint restoration)
|
| 63 |
trainer = Trainer(model, train_dataloader, val_dataloader, config, seq_len)
|
| 64 |
|
| 65 |
+
# Run training
|
| 66 |
trainer.train()
|
| 67 |
|
| 68 |
return trainer
|
llm_lab/training/scheduler.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
"""Cosine Annealing with Linear Warmup
|
| 2 |
|
| 3 |
import math
|
| 4 |
|
|
@@ -10,22 +10,22 @@ from llm_lab.config import TrainConfig
|
|
| 10 |
class CosineWarmupScheduler:
|
| 11 |
"""Cosine Annealing with Linear Warmup.
|
| 12 |
|
| 13 |
-
LR
|
| 14 |
┌─── peak_lr ───────╲
|
| 15 |
│ ╲ cosine decay
|
| 16 |
│ warmup (linear) ╲
|
| 17 |
│/ ╲_______ min_lr
|
| 18 |
└──────────────────────────────────→ steps
|
| 19 |
|
| 20 |
-
|
| 21 |
-
- Step decay:
|
| 22 |
-
- Linear decay:
|
| 23 |
-
- Cosine:
|
| 24 |
-
- GPT-3, LLaMA,
|
| 25 |
|
| 26 |
-
|
| 27 |
-
PyTorch
|
| 28 |
-
|
| 29 |
"""
|
| 30 |
|
| 31 |
def __init__(self, config: TrainConfig):
|
|
@@ -35,33 +35,33 @@ class CosineWarmupScheduler:
|
|
| 35 |
self.total_steps = config.total_steps
|
| 36 |
|
| 37 |
def get_lr(self, step: int) -> float:
|
| 38 |
-
"""
|
| 39 |
|
| 40 |
Args:
|
| 41 |
-
step:
|
| 42 |
|
| 43 |
Returns:
|
| 44 |
-
|
| 45 |
"""
|
| 46 |
# Phase 1: Linear Warmup
|
| 47 |
if step < self.warmup_steps:
|
| 48 |
-
#
|
| 49 |
return self.peak_lr * (step / self.warmup_steps)
|
| 50 |
|
| 51 |
# Phase 2: Cosine Decay
|
| 52 |
-
#
|
| 53 |
decay_steps = self.total_steps - self.warmup_steps
|
| 54 |
progress = (step - self.warmup_steps) / max(decay_steps, 1)
|
| 55 |
-
progress = min(progress, 1.0) #
|
| 56 |
|
| 57 |
-
# Cosine
|
| 58 |
cosine_decay = 0.5 * (1.0 + math.cos(math.pi * progress))
|
| 59 |
lr = self.min_lr + (self.peak_lr - self.min_lr) * cosine_decay
|
| 60 |
|
| 61 |
return lr
|
| 62 |
|
| 63 |
def set_lr(self, optimizer: torch.optim.Optimizer, step: int):
|
| 64 |
-
"""
|
| 65 |
lr = self.get_lr(step)
|
| 66 |
for param_group in optimizer.param_groups:
|
| 67 |
param_group["lr"] = lr
|
|
|
|
| 1 |
+
"""Cosine Annealing with Linear Warmup scheduler."""
|
| 2 |
|
| 3 |
import math
|
| 4 |
|
|
|
|
| 10 |
class CosineWarmupScheduler:
|
| 11 |
"""Cosine Annealing with Linear Warmup.
|
| 12 |
|
| 13 |
+
LR curve:
|
| 14 |
┌─── peak_lr ───────╲
|
| 15 |
│ ╲ cosine decay
|
| 16 |
│ warmup (linear) ╲
|
| 17 |
│/ ╲_______ min_lr
|
| 18 |
└──────────────────────────────────→ steps
|
| 19 |
|
| 20 |
+
Why Cosine Decay?
|
| 21 |
+
- Step decay: sudden LR drop → unstable loss
|
| 22 |
+
- Linear decay: LR decreases too quickly in the later stages
|
| 23 |
+
- Cosine: smooth decay, maintains appropriate LR even in the late training phase
|
| 24 |
+
- Used by most LLMs including GPT-3, LLaMA, and Chinchilla
|
| 25 |
|
| 26 |
+
Implementation note:
|
| 27 |
+
PyTorch has built-in schedulers (e.g., CosineAnnealingLR), but
|
| 28 |
+
a custom implementation is more flexible for warmup + min_lr + checkpoint restoration.
|
| 29 |
"""
|
| 30 |
|
| 31 |
def __init__(self, config: TrainConfig):
|
|
|
|
| 35 |
self.total_steps = config.total_steps
|
| 36 |
|
| 37 |
def get_lr(self, step: int) -> float:
|
| 38 |
+
"""Returns the learning rate for the current step.
|
| 39 |
|
| 40 |
Args:
|
| 41 |
+
step: Current optimizer step (0-indexed)
|
| 42 |
|
| 43 |
Returns:
|
| 44 |
+
Learning rate (float)
|
| 45 |
"""
|
| 46 |
# Phase 1: Linear Warmup
|
| 47 |
if step < self.warmup_steps:
|
| 48 |
+
# Linear increase from 0 to peak_lr
|
| 49 |
return self.peak_lr * (step / self.warmup_steps)
|
| 50 |
|
| 51 |
# Phase 2: Cosine Decay
|
| 52 |
+
# Progress ratio after warmup (0.0 → 1.0)
|
| 53 |
decay_steps = self.total_steps - self.warmup_steps
|
| 54 |
progress = (step - self.warmup_steps) / max(decay_steps, 1)
|
| 55 |
+
progress = min(progress, 1.0) # safety clamp
|
| 56 |
|
| 57 |
+
# Cosine formula: min_lr + 0.5 × (peak - min) × (1 + cos(π × progress))
|
| 58 |
cosine_decay = 0.5 * (1.0 + math.cos(math.pi * progress))
|
| 59 |
lr = self.min_lr + (self.peak_lr - self.min_lr) * cosine_decay
|
| 60 |
|
| 61 |
return lr
|
| 62 |
|
| 63 |
def set_lr(self, optimizer: torch.optim.Optimizer, step: int):
|
| 64 |
+
"""Updates the learning rate of the optimizer."""
|
| 65 |
lr = self.get_lr(step)
|
| 66 |
for param_group in optimizer.param_groups:
|
| 67 |
param_group["lr"] = lr
|
llm_lab/training/trainer.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
"""LLM
|
| 2 |
|
| 3 |
import math
|
| 4 |
import time
|
|
@@ -16,9 +16,9 @@ from .optimizer import create_optimizer
|
|
| 16 |
|
| 17 |
|
| 18 |
class Trainer:
|
| 19 |
-
"""LLM
|
| 20 |
|
| 21 |
-
|
| 22 |
```
|
| 23 |
for step in range(total_steps):
|
| 24 |
# ── Gradient Accumulation Loop ──
|
|
@@ -27,22 +27,22 @@ class Trainer:
|
|
| 27 |
with autocast(bf16):
|
| 28 |
logits, loss = model(input_ids, targets)
|
| 29 |
scaled_loss = loss / accumulation_steps
|
| 30 |
-
scaled_loss.backward() #
|
| 31 |
|
| 32 |
-
# ── Optimizer Step (accumulation
|
| 33 |
clip_grad_norm(model, max_norm=1.0)
|
| 34 |
optimizer.step()
|
| 35 |
optimizer.zero_grad()
|
| 36 |
scheduler.set_lr(optimizer, step)
|
| 37 |
```
|
| 38 |
|
| 39 |
-
Gradient Accumulation
|
| 40 |
-
-
|
| 41 |
-
-
|
| 42 |
-
-
|
| 43 |
-
-
|
| 44 |
-
-
|
| 45 |
-
|
| 46 |
"""
|
| 47 |
|
| 48 |
def __init__(
|
|
@@ -56,52 +56,52 @@ class Trainer:
|
|
| 56 |
self.config = config
|
| 57 |
self.seq_len = seq_len
|
| 58 |
|
| 59 |
-
# ──
|
| 60 |
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 61 |
-
print(f"[Trainer]
|
| 62 |
if torch.cuda.is_available():
|
| 63 |
print(f"[Trainer] GPU: {torch.cuda.get_device_name()}")
|
| 64 |
-
print(f"[Trainer] GPU
|
| 65 |
|
| 66 |
-
# ──
|
| 67 |
self.model = model.to(self.device)
|
| 68 |
-
# torch.compile: PyTorch 2.0+
|
| 69 |
if torch.cuda.is_available() and hasattr(torch, "compile"):
|
| 70 |
-
print("[Trainer] torch.compile
|
| 71 |
self.model = torch.compile(self.model)
|
| 72 |
|
| 73 |
-
# ──
|
| 74 |
self.train_dataloader = train_dataloader
|
| 75 |
self.val_dataloader = val_dataloader
|
| 76 |
self.train_iter = iter(train_dataloader)
|
| 77 |
|
| 78 |
-
# ──
|
| 79 |
self.optimizer = create_optimizer(self.model, config)
|
| 80 |
|
| 81 |
-
# ──
|
| 82 |
self.scheduler = CosineWarmupScheduler(config)
|
| 83 |
|
| 84 |
-
# ──
|
| 85 |
self.ckpt_manager = CheckpointManager(config)
|
| 86 |
|
| 87 |
-
# ──
|
| 88 |
self.metrics = MetricsTracker(config)
|
| 89 |
|
| 90 |
-
# ──
|
| 91 |
self.global_step = 0
|
| 92 |
self.best_val_loss = float("inf")
|
| 93 |
self.tokens_seen = 0
|
| 94 |
|
| 95 |
# ── Mixed Precision ──
|
| 96 |
-
# bf16
|
| 97 |
self.use_amp = config.dtype != "float32"
|
| 98 |
self.amp_dtype = config.torch_dtype
|
| 99 |
|
| 100 |
-
# ──
|
| 101 |
self._try_resume()
|
| 102 |
|
| 103 |
def _try_resume(self):
|
| 104 |
-
"""
|
| 105 |
result = self.ckpt_manager.load_latest(
|
| 106 |
self.model, self.optimizer, self.device
|
| 107 |
)
|
|
@@ -111,20 +111,20 @@ class Trainer:
|
|
| 111 |
self.best_val_loss = result["best_val_loss"]
|
| 112 |
self.metrics.history = result.get("metrics_history", self.metrics.history)
|
| 113 |
|
| 114 |
-
# wandb
|
| 115 |
if result.get("wandb_run_id"):
|
| 116 |
self.metrics.resume_wandb(result["wandb_run_id"])
|
| 117 |
|
| 118 |
self.tokens_seen = self.global_step * self.config.effective_batch_size * self.seq_len
|
| 119 |
-
print(f"[Trainer]
|
| 120 |
f"tokens={self.tokens_seen/1e9:.2f}B, "
|
| 121 |
f"best_val_loss={self.best_val_loss:.4f}")
|
| 122 |
|
| 123 |
def _get_next_batch(self) -> Dict[str, torch.Tensor]:
|
| 124 |
-
"""
|
| 125 |
|
| 126 |
-
Streaming DataLoader
|
| 127 |
-
|
| 128 |
"""
|
| 129 |
try:
|
| 130 |
batch = next(self.train_iter)
|
|
@@ -138,14 +138,14 @@ class Trainer:
|
|
| 138 |
}
|
| 139 |
|
| 140 |
def _train_step(self) -> Tuple[float, float]:
|
| 141 |
-
"""
|
| 142 |
|
| 143 |
Returns:
|
| 144 |
(loss, grad_norm)
|
| 145 |
"""
|
| 146 |
self.model.train()
|
| 147 |
self.optimizer.zero_grad(set_to_none=True)
|
| 148 |
-
# set_to_none=True:
|
| 149 |
|
| 150 |
total_loss = 0.0
|
| 151 |
|
|
@@ -157,16 +157,16 @@ class Trainer:
|
|
| 157 |
with torch.amp.autocast(device_type="cuda", dtype=self.amp_dtype, enabled=self.use_amp):
|
| 158 |
logits, loss = self.model(batch["input_ids"], batch["targets"])
|
| 159 |
|
| 160 |
-
# Loss
|
| 161 |
scaled_loss = loss / self.config.gradient_accumulation_steps
|
| 162 |
total_loss += loss.item()
|
| 163 |
|
| 164 |
-
# Backward (
|
| 165 |
scaled_loss.backward()
|
| 166 |
|
| 167 |
# ── Gradient Clipping ──
|
| 168 |
-
#
|
| 169 |
-
# norm
|
| 170 |
grad_norm = torch.nn.utils.clip_grad_norm_(
|
| 171 |
self.model.parameters(),
|
| 172 |
max_norm=self.config.grad_clip,
|
|
@@ -175,7 +175,7 @@ class Trainer:
|
|
| 175 |
# ── Optimizer Step ──
|
| 176 |
self.optimizer.step()
|
| 177 |
|
| 178 |
-
# ── LR
|
| 179 |
self.scheduler.set_lr(self.optimizer, self.global_step)
|
| 180 |
|
| 181 |
avg_loss = total_loss / self.config.gradient_accumulation_steps
|
|
@@ -183,13 +183,13 @@ class Trainer:
|
|
| 183 |
|
| 184 |
@torch.no_grad()
|
| 185 |
def _evaluate(self) -> Tuple[float, float]:
|
| 186 |
-
"""
|
| 187 |
|
| 188 |
Perplexity = exp(loss)
|
| 189 |
-
-
|
| 190 |
-
- PPL 100 →
|
| 191 |
-
- PPL 20 →
|
| 192 |
-
- PPL 10 →
|
| 193 |
"""
|
| 194 |
if self.val_dataloader is None:
|
| 195 |
return float("inf"), float("inf")
|
|
@@ -212,36 +212,37 @@ class Trainer:
|
|
| 212 |
num_batches += 1
|
| 213 |
|
| 214 |
avg_loss = total_loss / max(num_batches, 1)
|
| 215 |
-
perplexity = math.exp(min(avg_loss, 20)) # overflow
|
| 216 |
|
| 217 |
return avg_loss, perplexity
|
| 218 |
|
| 219 |
def train(self):
|
| 220 |
-
"""
|
| 221 |
|
| 222 |
-
|
| 223 |
-
|
|
|
|
| 224 |
"""
|
| 225 |
config = self.config
|
| 226 |
|
| 227 |
print("\n" + "=" * 70)
|
| 228 |
-
print("🚀
|
| 229 |
print("=" * 70)
|
| 230 |
-
print(f"
|
| 231 |
-
print(f"
|
| 232 |
print(f" Effective batch size: {config.effective_batch_size}")
|
| 233 |
-
print(f"
|
| 234 |
-
print(f"
|
| 235 |
print(f" Mixed Precision: {config.dtype}")
|
| 236 |
print(f" Gradient Accumulation: {config.gradient_accumulation_steps}")
|
| 237 |
-
print(f"
|
| 238 |
print("=" * 70 + "\n")
|
| 239 |
|
| 240 |
step_start_time = time.time()
|
| 241 |
tokens_at_log_start = self.tokens_seen
|
| 242 |
|
| 243 |
# ════════════════════════════════════════════
|
| 244 |
-
#
|
| 245 |
# ════════════════════════════════════════════
|
| 246 |
|
| 247 |
while self.global_step < config.total_steps:
|
|
@@ -257,21 +258,21 @@ class Trainer:
|
|
| 257 |
tokens_delta = self.tokens_seen - tokens_at_log_start
|
| 258 |
tokens_per_sec = tokens_delta / max(elapsed, 1e-6)
|
| 259 |
|
| 260 |
-
# GPU
|
| 261 |
gpu_mem_gb = 0.0
|
| 262 |
if torch.cuda.is_available():
|
| 263 |
gpu_mem_gb = torch.cuda.max_memory_allocated() / 1e9
|
| 264 |
|
| 265 |
-
#
|
| 266 |
current_lr = self.scheduler.get_lr(self.global_step)
|
| 267 |
|
| 268 |
-
#
|
| 269 |
remaining_steps = config.total_steps - self.global_step
|
| 270 |
steps_per_sec = config.log_interval / max(elapsed, 1e-6)
|
| 271 |
eta_seconds = remaining_steps / max(steps_per_sec, 1e-6)
|
| 272 |
eta_hours = eta_seconds / 3600
|
| 273 |
|
| 274 |
-
#
|
| 275 |
print(
|
| 276 |
f" Step {self.global_step:>6d}/{config.total_steps} │ "
|
| 277 |
f"Loss {loss:.4f} │ "
|
|
@@ -283,7 +284,7 @@ class Trainer:
|
|
| 283 |
f"Tokens {self.tokens_seen/1e9:.2f}B"
|
| 284 |
)
|
| 285 |
|
| 286 |
-
# wandb
|
| 287 |
self.metrics.log_train_step(
|
| 288 |
step=self.global_step,
|
| 289 |
loss=loss,
|
|
@@ -324,19 +325,19 @@ class Trainer:
|
|
| 324 |
)
|
| 325 |
|
| 326 |
# ════════════════════════════════════════════
|
| 327 |
-
#
|
| 328 |
# ════════════════════════════════════════════
|
| 329 |
|
| 330 |
print("\n" + "=" * 70)
|
| 331 |
-
print("🎉
|
| 332 |
print("=" * 70)
|
| 333 |
-
print(f"
|
| 334 |
-
print(f"
|
| 335 |
-
print(f"
|
| 336 |
-
print(f"
|
| 337 |
print("=" * 70)
|
| 338 |
|
| 339 |
-
#
|
| 340 |
self.ckpt_manager.save(
|
| 341 |
model=self.model,
|
| 342 |
optimizer=self.optimizer,
|
|
|
|
| 1 |
+
"""LLM pretraining trainer."""
|
| 2 |
|
| 3 |
import math
|
| 4 |
import time
|
|
|
|
| 16 |
|
| 17 |
|
| 18 |
class Trainer:
|
| 19 |
+
"""LLM pretraining trainer.
|
| 20 |
|
| 21 |
+
Core structure of the training loop:
|
| 22 |
```
|
| 23 |
for step in range(total_steps):
|
| 24 |
# ── Gradient Accumulation Loop ──
|
|
|
|
| 27 |
with autocast(bf16):
|
| 28 |
logits, loss = model(input_ids, targets)
|
| 29 |
scaled_loss = loss / accumulation_steps
|
| 30 |
+
scaled_loss.backward() # accumulate gradients
|
| 31 |
|
| 32 |
+
# ── Optimizer Step (after accumulation completes) ──
|
| 33 |
clip_grad_norm(model, max_norm=1.0)
|
| 34 |
optimizer.step()
|
| 35 |
optimizer.zero_grad()
|
| 36 |
scheduler.set_lr(optimizer, step)
|
| 37 |
```
|
| 38 |
|
| 39 |
+
What is Gradient Accumulation?
|
| 40 |
+
- Used when a large batch cannot fit into GPU memory all at once
|
| 41 |
+
- Run forward/backward multiple times with small micro_batches → accumulate gradients
|
| 42 |
+
- Perform optimizer step once after accumulation is complete
|
| 43 |
+
- Effectively equivalent to training with a large effective batch size
|
| 44 |
+
- Reason for dividing loss by accumulation_steps:
|
| 45 |
+
to compute the mean of gradients (average, not sum)
|
| 46 |
"""
|
| 47 |
|
| 48 |
def __init__(
|
|
|
|
| 56 |
self.config = config
|
| 57 |
self.seq_len = seq_len
|
| 58 |
|
| 59 |
+
# ── Device setup ──
|
| 60 |
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 61 |
+
print(f"[Trainer] Device: {self.device}")
|
| 62 |
if torch.cuda.is_available():
|
| 63 |
print(f"[Trainer] GPU: {torch.cuda.get_device_name()}")
|
| 64 |
+
print(f"[Trainer] GPU Memory: {torch.cuda.get_device_properties(0).total_mem / 1e9:.1f} GB")
|
| 65 |
|
| 66 |
+
# ── Model ──
|
| 67 |
self.model = model.to(self.device)
|
| 68 |
+
# torch.compile: PyTorch 2.0+ graph optimization (10-30% speed improvement)
|
| 69 |
if torch.cuda.is_available() and hasattr(torch, "compile"):
|
| 70 |
+
print("[Trainer] Applying torch.compile...")
|
| 71 |
self.model = torch.compile(self.model)
|
| 72 |
|
| 73 |
+
# ── Data ──
|
| 74 |
self.train_dataloader = train_dataloader
|
| 75 |
self.val_dataloader = val_dataloader
|
| 76 |
self.train_iter = iter(train_dataloader)
|
| 77 |
|
| 78 |
+
# ── Optimizer ──
|
| 79 |
self.optimizer = create_optimizer(self.model, config)
|
| 80 |
|
| 81 |
+
# ── Scheduler ──
|
| 82 |
self.scheduler = CosineWarmupScheduler(config)
|
| 83 |
|
| 84 |
+
# ── Checkpoint ──
|
| 85 |
self.ckpt_manager = CheckpointManager(config)
|
| 86 |
|
| 87 |
+
# ── Metrics ──
|
| 88 |
self.metrics = MetricsTracker(config)
|
| 89 |
|
| 90 |
+
# ── Training state ──
|
| 91 |
self.global_step = 0
|
| 92 |
self.best_val_loss = float("inf")
|
| 93 |
self.tokens_seen = 0
|
| 94 |
|
| 95 |
# ── Mixed Precision ──
|
| 96 |
+
# bf16 does not require GradScaler (only needed for fp16)
|
| 97 |
self.use_amp = config.dtype != "float32"
|
| 98 |
self.amp_dtype = config.torch_dtype
|
| 99 |
|
| 100 |
+
# ── Attempt automatic resume ──
|
| 101 |
self._try_resume()
|
| 102 |
|
| 103 |
def _try_resume(self):
|
| 104 |
+
"""Automatically restores from a previous checkpoint if one exists."""
|
| 105 |
result = self.ckpt_manager.load_latest(
|
| 106 |
self.model, self.optimizer, self.device
|
| 107 |
)
|
|
|
|
| 111 |
self.best_val_loss = result["best_val_loss"]
|
| 112 |
self.metrics.history = result.get("metrics_history", self.metrics.history)
|
| 113 |
|
| 114 |
+
# Resume wandb logging continuously
|
| 115 |
if result.get("wandb_run_id"):
|
| 116 |
self.metrics.resume_wandb(result["wandb_run_id"])
|
| 117 |
|
| 118 |
self.tokens_seen = self.global_step * self.config.effective_batch_size * self.seq_len
|
| 119 |
+
print(f"[Trainer] Resuming training: step={self.global_step}, "
|
| 120 |
f"tokens={self.tokens_seen/1e9:.2f}B, "
|
| 121 |
f"best_val_loss={self.best_val_loss:.4f}")
|
| 122 |
|
| 123 |
def _get_next_batch(self) -> Dict[str, torch.Tensor]:
|
| 124 |
+
"""Fetches the next training batch.
|
| 125 |
|
| 126 |
+
Since a Streaming DataLoader has no epoch concept,
|
| 127 |
+
a new iterator is created when StopIteration is raised.
|
| 128 |
"""
|
| 129 |
try:
|
| 130 |
batch = next(self.train_iter)
|
|
|
|
| 138 |
}
|
| 139 |
|
| 140 |
def _train_step(self) -> Tuple[float, float]:
|
| 141 |
+
"""Performs one optimizer step.
|
| 142 |
|
| 143 |
Returns:
|
| 144 |
(loss, grad_norm)
|
| 145 |
"""
|
| 146 |
self.model.train()
|
| 147 |
self.optimizer.zero_grad(set_to_none=True)
|
| 148 |
+
# set_to_none=True: sets gradients to None → saves memory
|
| 149 |
|
| 150 |
total_loss = 0.0
|
| 151 |
|
|
|
|
| 157 |
with torch.amp.autocast(device_type="cuda", dtype=self.amp_dtype, enabled=self.use_amp):
|
| 158 |
logits, loss = self.model(batch["input_ids"], batch["targets"])
|
| 159 |
|
| 160 |
+
# Loss scaling: to compute the mean over the effective batch
|
| 161 |
scaled_loss = loss / self.config.gradient_accumulation_steps
|
| 162 |
total_loss += loss.item()
|
| 163 |
|
| 164 |
+
# Backward (accumulate gradients)
|
| 165 |
scaled_loss.backward()
|
| 166 |
|
| 167 |
# ── Gradient Clipping ──
|
| 168 |
+
# Treat all parameter gradients as a single vector and compute L2 norm
|
| 169 |
+
# If norm exceeds max_norm, scale down proportionally
|
| 170 |
grad_norm = torch.nn.utils.clip_grad_norm_(
|
| 171 |
self.model.parameters(),
|
| 172 |
max_norm=self.config.grad_clip,
|
|
|
|
| 175 |
# ── Optimizer Step ──
|
| 176 |
self.optimizer.step()
|
| 177 |
|
| 178 |
+
# ── LR Update ──
|
| 179 |
self.scheduler.set_lr(self.optimizer, self.global_step)
|
| 180 |
|
| 181 |
avg_loss = total_loss / self.config.gradient_accumulation_steps
|
|
|
|
| 183 |
|
| 184 |
@torch.no_grad()
|
| 185 |
def _evaluate(self) -> Tuple[float, float]:
|
| 186 |
+
"""Measures Loss and Perplexity on the validation data.
|
| 187 |
|
| 188 |
Perplexity = exp(loss)
|
| 189 |
+
- Intuition: "how many candidates does the model choose the next token from on average"
|
| 190 |
+
- PPL 100 → equivalent to uniformly choosing 1 out of 100
|
| 191 |
+
- PPL 20 → 1 out of 20 (fairly good)
|
| 192 |
+
- PPL 10 → predicting with high confidence
|
| 193 |
"""
|
| 194 |
if self.val_dataloader is None:
|
| 195 |
return float("inf"), float("inf")
|
|
|
|
| 212 |
num_batches += 1
|
| 213 |
|
| 214 |
avg_loss = total_loss / max(num_batches, 1)
|
| 215 |
+
perplexity = math.exp(min(avg_loss, 20)) # prevent overflow (exp(20) ≈ 500M)
|
| 216 |
|
| 217 |
return avg_loss, perplexity
|
| 218 |
|
| 219 |
def train(self):
|
| 220 |
+
"""Main training loop.
|
| 221 |
|
| 222 |
+
This method runs the entire training process.
|
| 223 |
+
Even if interrupted by a Colab session expiry,
|
| 224 |
+
training will automatically resume from the last checkpoint.
|
| 225 |
"""
|
| 226 |
config = self.config
|
| 227 |
|
| 228 |
print("\n" + "=" * 70)
|
| 229 |
+
print("🚀 Training started")
|
| 230 |
print("=" * 70)
|
| 231 |
+
print(f" Total steps: {config.total_steps:,}")
|
| 232 |
+
print(f" Start step: {self.global_step}")
|
| 233 |
print(f" Effective batch size: {config.effective_batch_size}")
|
| 234 |
+
print(f" Tokens/step: {config.effective_batch_size * self.seq_len:,}")
|
| 235 |
+
print(f" Total training tokens (estimated): {config.total_steps * config.effective_batch_size * self.seq_len / 1e9:.1f}B")
|
| 236 |
print(f" Mixed Precision: {config.dtype}")
|
| 237 |
print(f" Gradient Accumulation: {config.gradient_accumulation_steps}")
|
| 238 |
+
print(f" Checkpoint: {config.checkpoint_dir}")
|
| 239 |
print("=" * 70 + "\n")
|
| 240 |
|
| 241 |
step_start_time = time.time()
|
| 242 |
tokens_at_log_start = self.tokens_seen
|
| 243 |
|
| 244 |
# ════════════════════════════════════════════
|
| 245 |
+
# Main loop
|
| 246 |
# ════════════════════════════════════════════
|
| 247 |
|
| 248 |
while self.global_step < config.total_steps:
|
|
|
|
| 258 |
tokens_delta = self.tokens_seen - tokens_at_log_start
|
| 259 |
tokens_per_sec = tokens_delta / max(elapsed, 1e-6)
|
| 260 |
|
| 261 |
+
# GPU memory
|
| 262 |
gpu_mem_gb = 0.0
|
| 263 |
if torch.cuda.is_available():
|
| 264 |
gpu_mem_gb = torch.cuda.max_memory_allocated() / 1e9
|
| 265 |
|
| 266 |
+
# Current LR
|
| 267 |
current_lr = self.scheduler.get_lr(self.global_step)
|
| 268 |
|
| 269 |
+
# Estimate remaining time
|
| 270 |
remaining_steps = config.total_steps - self.global_step
|
| 271 |
steps_per_sec = config.log_interval / max(elapsed, 1e-6)
|
| 272 |
eta_seconds = remaining_steps / max(steps_per_sec, 1e-6)
|
| 273 |
eta_hours = eta_seconds / 3600
|
| 274 |
|
| 275 |
+
# Console output
|
| 276 |
print(
|
| 277 |
f" Step {self.global_step:>6d}/{config.total_steps} │ "
|
| 278 |
f"Loss {loss:.4f} │ "
|
|
|
|
| 284 |
f"Tokens {self.tokens_seen/1e9:.2f}B"
|
| 285 |
)
|
| 286 |
|
| 287 |
+
# wandb logging
|
| 288 |
self.metrics.log_train_step(
|
| 289 |
step=self.global_step,
|
| 290 |
loss=loss,
|
|
|
|
| 325 |
)
|
| 326 |
|
| 327 |
# ════════════════════════════════════════════
|
| 328 |
+
# Training complete
|
| 329 |
# ════════════════════════════════════════════
|
| 330 |
|
| 331 |
print("\n" + "=" * 70)
|
| 332 |
+
print("🎉 Training complete!")
|
| 333 |
print("=" * 70)
|
| 334 |
+
print(f" Total steps: {self.global_step:,}")
|
| 335 |
+
print(f" Total tokens: {self.tokens_seen/1e9:.2f}B")
|
| 336 |
+
print(f" Best Val Loss: {self.best_val_loss:.4f}")
|
| 337 |
+
print(f" Best Val PPL: {math.exp(min(self.best_val_loss, 20)):.2f}")
|
| 338 |
print("=" * 70)
|
| 339 |
|
| 340 |
+
# Save final checkpoint
|
| 341 |
self.ckpt_manager.save(
|
| 342 |
model=self.model,
|
| 343 |
optimizer=self.optimizer,
|
llm_lab/utils/__init__.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
"""
|
| 2 |
from .device import get_device, detect_gpu_info, auto_configure
|
| 3 |
from .seed import set_seed
|
| 4 |
|
|
|
|
| 1 |
+
"""Common utilities — device detection, seed configuration."""
|
| 2 |
from .device import get_device, detect_gpu_info, auto_configure
|
| 3 |
from .seed import set_seed
|
| 4 |
|
llm_lab/utils/device.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
"""
|
| 2 |
from __future__ import annotations
|
| 3 |
|
| 4 |
from typing import TYPE_CHECKING
|
|
@@ -10,15 +10,15 @@ if TYPE_CHECKING:
|
|
| 10 |
|
| 11 |
|
| 12 |
def get_device() -> torch.device:
|
| 13 |
-
"""
|
| 14 |
return torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 15 |
|
| 16 |
|
| 17 |
def detect_gpu_info() -> dict:
|
| 18 |
-
"""GPU
|
| 19 |
|
| 20 |
Returns:
|
| 21 |
-
{"name": str, "memory_gb": float}
|
| 22 |
"""
|
| 23 |
if not torch.cuda.is_available():
|
| 24 |
return {}
|
|
@@ -29,16 +29,16 @@ def detect_gpu_info() -> dict:
|
|
| 29 |
|
| 30 |
|
| 31 |
def auto_configure(config: "TrainConfig") -> "TrainConfig":
|
| 32 |
-
"""
|
| 33 |
|
| 34 |
-
Colab Pro+
|
| 35 |
-
T4
|
| 36 |
|
| 37 |
Returns:
|
| 38 |
-
|
| 39 |
"""
|
| 40 |
if not torch.cuda.is_available():
|
| 41 |
-
print("⚠️ GPU
|
| 42 |
config.dtype = "float32"
|
| 43 |
config.micro_batch_size = 1
|
| 44 |
config.gradient_accumulation_steps = 4
|
|
@@ -47,37 +47,37 @@ def auto_configure(config: "TrainConfig") -> "TrainConfig":
|
|
| 47 |
gpu_name = torch.cuda.get_device_name().lower()
|
| 48 |
gpu_mem = torch.cuda.get_device_properties(0).total_mem / 1e9
|
| 49 |
|
| 50 |
-
print(f"\n🔍 GPU
|
| 51 |
|
| 52 |
if "a100" in gpu_name:
|
| 53 |
-
# A100 40GB:
|
| 54 |
-
print(" → A100
|
| 55 |
config.dtype = "bfloat16"
|
| 56 |
config.micro_batch_size = 4
|
| 57 |
|
| 58 |
elif "v100" in gpu_name:
|
| 59 |
-
# V100 16GB: bf16
|
| 60 |
-
print(" → V100
|
| 61 |
config.dtype = "float16"
|
| 62 |
config.micro_batch_size = 2
|
| 63 |
-
config.gradient_accumulation_steps = 64 # effective batch
|
| 64 |
|
| 65 |
elif "t4" in gpu_name:
|
| 66 |
-
# T4 16GB: bf16
|
| 67 |
-
print(" → T4
|
| 68 |
config.dtype = "float16"
|
| 69 |
config.micro_batch_size = 1
|
| 70 |
config.gradient_accumulation_steps = 128
|
| 71 |
|
| 72 |
elif "l4" in gpu_name:
|
| 73 |
-
# L4 24GB: bf16
|
| 74 |
-
print(" → L4
|
| 75 |
config.dtype = "bfloat16"
|
| 76 |
config.micro_batch_size = 2
|
| 77 |
config.gradient_accumulation_steps = 64
|
| 78 |
|
| 79 |
else:
|
| 80 |
-
print(f" →
|
| 81 |
if gpu_mem >= 30:
|
| 82 |
config.micro_batch_size = 4
|
| 83 |
elif gpu_mem >= 16:
|
|
|
|
| 1 |
+
"""Device detection and auto-configuration utilities."""
|
| 2 |
from __future__ import annotations
|
| 3 |
|
| 4 |
from typing import TYPE_CHECKING
|
|
|
|
| 10 |
|
| 11 |
|
| 12 |
def get_device() -> torch.device:
|
| 13 |
+
"""Returns the available device (cuda or cpu)."""
|
| 14 |
return torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 15 |
|
| 16 |
|
| 17 |
def detect_gpu_info() -> dict:
|
| 18 |
+
"""Returns GPU name and memory information.
|
| 19 |
|
| 20 |
Returns:
|
| 21 |
+
{"name": str, "memory_gb": float} or an empty dict if no GPU is available
|
| 22 |
"""
|
| 23 |
if not torch.cuda.is_available():
|
| 24 |
return {}
|
|
|
|
| 29 |
|
| 30 |
|
| 31 |
def auto_configure(config: "TrainConfig") -> "TrainConfig":
|
| 32 |
+
"""Automatically adjusts configuration based on GPU type.
|
| 33 |
|
| 34 |
+
In Colab Pro+, an A100 is not always assigned.
|
| 35 |
+
If a T4 or V100 is assigned, configuration is automatically adjusted.
|
| 36 |
|
| 37 |
Returns:
|
| 38 |
+
Adjusted TrainConfig
|
| 39 |
"""
|
| 40 |
if not torch.cuda.is_available():
|
| 41 |
+
print("⚠️ No GPU found! Running in CPU mode (very slow)")
|
| 42 |
config.dtype = "float32"
|
| 43 |
config.micro_batch_size = 1
|
| 44 |
config.gradient_accumulation_steps = 4
|
|
|
|
| 47 |
gpu_name = torch.cuda.get_device_name().lower()
|
| 48 |
gpu_mem = torch.cuda.get_device_properties(0).total_mem / 1e9
|
| 49 |
|
| 50 |
+
print(f"\n🔍 GPU detected: {torch.cuda.get_device_name()} ({gpu_mem:.1f} GB)")
|
| 51 |
|
| 52 |
if "a100" in gpu_name:
|
| 53 |
+
# A100 40GB: use default settings (optimal)
|
| 54 |
+
print(" → A100 detected: using default settings (bf16, batch=4)")
|
| 55 |
config.dtype = "bfloat16"
|
| 56 |
config.micro_batch_size = 4
|
| 57 |
|
| 58 |
elif "v100" in gpu_name:
|
| 59 |
+
# V100 16GB: bf16 not supported, reduce batch size
|
| 60 |
+
print(" → V100 detected: fp16 mode, reduced batch size")
|
| 61 |
config.dtype = "float16"
|
| 62 |
config.micro_batch_size = 2
|
| 63 |
+
config.gradient_accumulation_steps = 64 # maintain effective batch size
|
| 64 |
|
| 65 |
elif "t4" in gpu_name:
|
| 66 |
+
# T4 16GB: bf16 not supported, smaller batch
|
| 67 |
+
print(" → T4 detected: fp16 mode, minimum batch size")
|
| 68 |
config.dtype = "float16"
|
| 69 |
config.micro_batch_size = 1
|
| 70 |
config.gradient_accumulation_steps = 128
|
| 71 |
|
| 72 |
elif "l4" in gpu_name:
|
| 73 |
+
# L4 24GB: bf16 supported
|
| 74 |
+
print(" → L4 detected: bf16 mode, adjusted batch size")
|
| 75 |
config.dtype = "bfloat16"
|
| 76 |
config.micro_batch_size = 2
|
| 77 |
config.gradient_accumulation_steps = 64
|
| 78 |
|
| 79 |
else:
|
| 80 |
+
print(f" → Unknown GPU. Adjusting settings based on memory")
|
| 81 |
if gpu_mem >= 30:
|
| 82 |
config.micro_batch_size = 4
|
| 83 |
elif gpu_mem >= 16:
|
llm_lab/utils/seed.py
CHANGED
|
@@ -1,9 +1,9 @@
|
|
| 1 |
-
"""
|
| 2 |
import torch
|
| 3 |
|
| 4 |
|
| 5 |
def set_seed(seed: int = 42):
|
| 6 |
-
"""
|
| 7 |
torch.manual_seed(seed)
|
| 8 |
if torch.cuda.is_available():
|
| 9 |
torch.cuda.manual_seed(seed)
|
|
|
|
| 1 |
+
"""Seed utility for reproducibility."""
|
| 2 |
import torch
|
| 3 |
|
| 4 |
|
| 5 |
def set_seed(seed: int = 42):
|
| 6 |
+
"""Set seed for reproducibility."""
|
| 7 |
torch.manual_seed(seed)
|
| 8 |
if torch.cuda.is_available():
|
| 9 |
torch.cuda.manual_seed(seed)
|