```python
from dataclasses import dataclass


@dataclass
class ModelConfig:
    vocab_size: int = 50304    # rounded-up GPT-2 vocab for better matmul shapes
    d_model: int = 768
    n_layers: int = 10
    n_heads: int = 12          # head_dim = 64
    d_ff: int = 2048           # canonical SwiGLU 8/3 * d_model
    seq_len: int = 2048
    dropout: float = 0.0
    rope_theta: float = 10000.0
    tie_embeddings: bool = True
    norm_eps: float = 1e-5
```
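As a sanity check on the "~100M params" figure referenced in the training config below, here is a rough back-of-the-envelope count derived from `ModelConfig`. It is a sketch, not an exact count: it assumes a SwiGLU MLP with three projections and tied input/output embeddings (both implied by the config comments) and ignores the small contribution from norm parameters.

```python
# Rough parameter count for ModelConfig above (approximate, norms ignored):
# - token embedding (tied with the output head): vocab_size * d_model
# - per-layer attention (Q, K, V, O projections): 4 * d_model^2
# - per-layer SwiGLU MLP (gate, up, down):        3 * d_model * d_ff
cfg = ModelConfig()
emb = cfg.vocab_size * cfg.d_model                 # ~38.6M
attn = 4 * cfg.d_model * cfg.d_model               # ~2.4M per layer
mlp = 3 * cfg.d_model * cfg.d_ff                   # ~4.7M per layer
total = emb + cfg.n_layers * (attn + mlp)          # ~109M total, ~71M non-embedding
print(f"~{total / 1e6:.1f}M parameters")
```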
```python
@dataclass
class TrainConfig:
    # Paths
    data_dir: str = "data"
    out_dir: str = "checkpoints"
    # Model (mirrors ModelConfig so a single dataclass configures runs)
    vocab_size: int = 50304
    d_model: int = 768
    n_layers: int = 10
    n_heads: int = 12
    d_ff: int = 2048
    seq_len: int = 2048
    dropout: float = 0.0
    # Training budget
    target_tokens: int = 1_000_000_000
    # Memory at seq=2048 for ~100M params: keep microbatches small and use
    # grad accumulation to keep effective batch = 32 × 2048 = 65_536 tok/step.
    batch_size: int = 4
    grad_accum_steps: int = 8
    # Optimizer / schedule
    learning_rate: float = 6e-4
    min_lr_ratio: float = 0.1
    warmup_tokens: int = 3_000_000
    weight_decay: float = 0.1
    beta1: float = 0.9
    beta2: float = 0.95
    grad_clip: float = 1.0
    # Checkpoint / eval cadence (in tokens)
    ckpt_every_tokens: int = 100_000_000
    eval_every_tokens: int = 6_000_000
    eval_batches: int = 50
    # Logging
    log_every_steps: int = 10
    # System
    device: str = "mps"
    dtype: str = "bfloat16"
    seed: int = 1337
```
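The token-denominated fields translate into optimizer steps as follows. This is an illustrative sketch: the arithmetic follows directly from the config (4 × 8 × 2048 = 65_536 tokens per optimizer step), but the warmup-then-cosine decay shape is an assumption suggested by the `warmup_tokens` and `min_lr_ratio` fields, not something the config itself pins down.

```python
import math

cfg = TrainConfig()

# Tokens consumed per optimizer step: microbatch x grad accumulation x context length.
tokens_per_step = cfg.batch_size * cfg.grad_accum_steps * cfg.seq_len  # 65_536
total_steps = cfg.target_tokens // tokens_per_step                     # ~15_258 steps for 1B tokens
warmup_steps = cfg.warmup_tokens // tokens_per_step                    # ~45 steps

def lr_at(step: int) -> float:
    """Warmup then cosine decay to min_lr_ratio * learning_rate (assumed schedule shape)."""
    if step < warmup_steps:
        return cfg.learning_rate * (step + 1) / warmup_steps
    progress = (step - warmup_steps) / max(1, total_steps - warmup_steps)
    min_lr = cfg.min_lr_ratio * cfg.learning_rate
    return min_lr + 0.5 * (cfg.learning_rate - min_lr) * (1 + math.cos(math.pi * progress))
```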