| """ | |
| Nano-GPT Configuration | |
| L4-SAFE: Reduced memory usage | |
| """ | |
| import torch | |
| from dataclasses import dataclass | |
@dataclass
class NanoGPTConfig:
    # Model Architecture
    vocab_size: int = 32000
    n_layers: int = 8        # REDUCED from 12
    n_heads: int = 8         # REDUCED from 12
    n_embd: int = 512        # REDUCED from 768
    block_size: int = 512    # REDUCED from 1024 (KEY!)
    dropout: float = 0.1
    bias: bool = True
    # Training Hyperparameters
    batch_size: int = 16                  # REDUCED from 32 (KEY!)
    gradient_accumulation_steps: int = 8  # INCREASED from 4
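    # With these defaults the effective batch size is 16 * 8 = 128 sequences per optimizer step.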
    learning_rate: float = 3e-4
    max_iters: int = 100000
    weight_decay: float = 0.1
    beta1: float = 0.9
    beta2: float = 0.95
    grad_clip: float = 1.0

    # Learning Rate Scheduling (see the illustrative get_lr sketch below)
    decay_lr: bool = True
    warmup_iters: int = 2000
    lr_decay_iters: int = 100000
    min_lr: float = 3e-5
    # Evaluation & Logging
    eval_interval: int = 1000
    eval_iters: int = 100    # REDUCED from 200
    log_interval: int = 100

    # Checkpointing
    save_interval: int = 5000
    checkpoint_dir: str = "checkpoints"

    # Data
    dataset_mix: dict = None

    # Hardware
    device: str = 'cuda' if torch.cuda.is_available() else 'cpu'
    dtype: str = 'bfloat16'
    compile: bool = False    # DISABLED torch.compile (uses more memory!)

    # Reproducibility
    seed: int = 42
    def __post_init__(self):
        if self.dataset_mix is None:
            self.dataset_mix = {
                'fineweb': 1.0
            }
    @property
    def n_params(self):
        """Rough parameter count in millions: embedding/output matrices
        (2 * vocab_size * n_embd) plus ~12 * n_embd**2 per transformer block."""
        return (2 * self.vocab_size * self.n_embd +
                12 * self.n_layers * self.n_embd * self.n_embd) / 1e6
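# Illustrative only: one way the scheduling fields above (decay_lr, warmup_iters,
# lr_decay_iters, min_lr) could drive a linear-warmup + cosine-decay schedule.
# The actual training loop may implement its scheduling differently.
import math

def get_lr(it: int, cfg: NanoGPTConfig) -> float:
    """Learning rate at iteration `it` under a warmup + cosine-decay schedule (sketch)."""
    if not cfg.decay_lr:
        return cfg.learning_rate
    if it < cfg.warmup_iters:
        # linear warmup from 0 up to learning_rate
        return cfg.learning_rate * (it + 1) / cfg.warmup_iters
    if it > cfg.lr_decay_iters:
        return cfg.min_lr
    # cosine decay from learning_rate down to min_lr
    decay_ratio = (it - cfg.warmup_iters) / (cfg.lr_decay_iters - cfg.warmup_iters)
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))
    return cfg.min_lr + coeff * (cfg.learning_rate - cfg.min_lr)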
config = NanoGPTConfig()
if __name__ == "__main__":
    print(f"Model size: ~{config.n_params:.1f}M parameters")
    print(f"Sequence length: {config.block_size}")
    print(f"Batch size: {config.batch_size}")
    print(f"Effective batch: {config.batch_size * config.gradient_accumulation_steps}")