"""Model and training configuration for the Intellite 100M model (config.py)."""
from dataclasses import dataclass


@dataclass
class ModelConfig:
    vocab_size: int = 50304      # GPT-2 vocab (50257) rounded up to a multiple of 128 for better matmul shapes
    d_model: int = 768
    n_layers: int = 10
    n_heads: int = 12            # head_dim = d_model // n_heads = 64
    d_ff: int = 2048             # canonical SwiGLU width: 8/3 * d_model
    seq_len: int = 2048
    dropout: float = 0.0
    rope_theta: float = 10000.0  # base frequency for rotary position embeddings
    tie_embeddings: bool = True  # share input embedding and output head weights
    norm_eps: float = 1e-5
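

# A rough parameter-count sketch (illustrative only, not used by training).
# The breakdown assumes a standard pre-norm transformer with tied embeddings,
# separate q/k/v/output projections, and a three-matrix SwiGLU feed-forward;
# those architectural details are assumptions, not read from this file.
def approx_param_count(cfg: ModelConfig = ModelConfig()) -> int:
    embed = cfg.vocab_size * cfg.d_model   # token embeddings (tied output head counted once)
    attn = 4 * cfg.d_model * cfg.d_model   # q, k, v, and output projections per layer
    ffn = 3 * cfg.d_model * cfg.d_ff       # SwiGLU gate, up, and down matrices per layer
    norms = 2 * cfg.d_model                # two norm scale vectors per layer
    # Defaults: ~38.6M embeddings + 10 * ~7.08M per layer ~= 109M parameters,
    # consistent with the model's "100M" name.
    return embed + cfg.n_layers * (attn + ffn + norms) + cfg.d_model  # + final norm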


@dataclass
class TrainConfig:
    # Paths
    data_dir: str = "data"
    out_dir: str = "checkpoints"

    # Model (mirrors ModelConfig so a single dataclass configures runs)
    vocab_size: int = 50304
    d_model: int = 768
    n_layers: int = 10
    n_heads: int = 12
    d_ff: int = 2048
    seq_len: int = 2048
    dropout: float = 0.0

    # Training budget
    target_tokens: int = 1_000_000_000

    # Memory is tight at seq_len=2048 for a ~100M-param model, so keep the
    # microbatch small and rely on gradient accumulation: the effective batch
    # is 4 * 8 = 32 sequences, i.e. 32 * 2048 = 65_536 tokens/step (see the
    # token-budget sketch after this class).
    batch_size: int = 4
    grad_accum_steps: int = 8

    # Optimizer / schedule (see the LR-schedule sketch after this class)
    learning_rate: float = 6e-4
    min_lr_ratio: float = 0.1  # LR floor = 0.1 * learning_rate
    warmup_tokens: int = 3_000_000
    weight_decay: float = 0.1
    beta1: float = 0.9
    beta2: float = 0.95
    grad_clip: float = 1.0

    # Checkpoint / eval cadence (in tokens)
    ckpt_every_tokens: int = 100_000_000
    eval_every_tokens: int = 6_000_000
    eval_batches: int = 50

    # Logging
    log_every_steps: int = 10

    # System
    device: str = "mps"  # Apple Silicon GPU via PyTorch's MPS backend
    dtype: str = "bfloat16"
    seed: int = 1337
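

# A quick sanity check of the token budget (illustrative only, not used by
# training): tokens per optimizer step and the total step count follow
# directly from the fields above.
def tokens_per_step(cfg: TrainConfig = TrainConfig()) -> int:
    # microbatch * accumulation steps * sequence length = 4 * 8 * 2048 = 65_536
    return cfg.batch_size * cfg.grad_accum_steps * cfg.seq_len


def total_steps(cfg: TrainConfig = TrainConfig()) -> int:
    # 1_000_000_000 // 65_536 ~= 15_258 optimizer steps to exhaust the budget
    return cfg.target_tokens // tokens_per_step(cfg)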
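

# A minimal LR-schedule sketch: linear warmup over warmup_tokens, then decay
# to min_lr_ratio * learning_rate by target_tokens. The warmup and floor are
# implied by the fields above; the cosine shape of the decay is an assumption,
# not something this file confirms.
import math


def lr_at(cfg: TrainConfig, tokens_seen: int) -> float:
    if tokens_seen < cfg.warmup_tokens:
        # linear warmup from 0 up to the peak learning rate
        return cfg.learning_rate * tokens_seen / cfg.warmup_tokens
    # cosine decay from the peak down to the floor over the remaining budget
    span = cfg.target_tokens - cfg.warmup_tokens
    progress = min((tokens_seen - cfg.warmup_tokens) / span, 1.0)
    floor = cfg.min_lr_ratio * cfg.learning_rate
    return floor + 0.5 * (cfg.learning_rate - floor) * (1.0 + math.cos(math.pi * progress))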