"""
Nano-GPT Configuration  
L4-SAFE: Reduced memory usage
"""

import torch
from dataclasses import dataclass
from typing import Optional

@dataclass
class NanoGPTConfig:
    # Model Architecture
    vocab_size: int = 32000
    n_layers: int = 8          # REDUCED from 12
    n_heads: int = 8           # REDUCED from 12
    n_embd: int = 512          # REDUCED from 768
    block_size: int = 512      # REDUCED from 1024 (KEY!)
    dropout: float = 0.1
    bias: bool = True
    
    # Training Hyperparameters
    batch_size: int = 16       # REDUCED from 32 (KEY!)
    gradient_accumulation_steps: int = 8  # INCREASED from 4
    learning_rate: float = 3e-4
    max_iters: int = 100000
    weight_decay: float = 0.1
    beta1: float = 0.9
    beta2: float = 0.95
    grad_clip: float = 1.0
    
    # Learning Rate Scheduling
    decay_lr: bool = True
    warmup_iters: int = 2000
    lr_decay_iters: int = 100000
    min_lr: float = 3e-5
    
    # Evaluation & Logging
    eval_interval: int = 1000
    eval_iters: int = 100      # REDUCED from 200
    log_interval: int = 100
    
    # Checkpointing
    save_interval: int = 5000
    checkpoint_dir: str = "checkpoints"
    
    # Data
    dataset_mix: Optional[dict] = None  # filled in by __post_init__ when not provided
    
    # Hardware
    device: str = 'cuda' if torch.cuda.is_available() else 'cpu'  # resolved once, at class-definition time
    dtype: str = 'bfloat16'    # halves activation memory vs. fp32; efficient bf16 needs an Ampere-or-newer GPU
    compile: bool = False      # DISABLED torch.compile (uses more memory!)
    
    # Reproducibility
    seed: int = 42
    
    def __post_init__(self):
        if self.dataset_mix is None:
            self.dataset_mix = {
                'fineweb': 1.0
            }
    
    @property
    def n_params(self):
        """Approximate parameter count in millions: 2 * vocab_size * n_embd for
        (untied) token and output embeddings, plus the standard 12 * n_embd^2
        per transformer block for attention and MLP weights. Position
        embeddings, norms, and biases are ignored."""
        return (2 * self.vocab_size * self.n_embd +
                12 * self.n_layers * self.n_embd * self.n_embd) / 1e6
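
# --- Illustrative sketch (not part of the original config): one way the
# optimizer hyperparameters above might be consumed by a training script.
# AdamW with the configured betas and weight decay is the usual nanoGPT-style
# choice; `model` is a placeholder for any torch.nn.Module built from this config.
def make_optimizer(model: torch.nn.Module, cfg: NanoGPTConfig) -> torch.optim.AdamW:
    return torch.optim.AdamW(
        model.parameters(),
        lr=cfg.learning_rate,
        betas=(cfg.beta1, cfg.beta2),
        weight_decay=cfg.weight_decay,
    )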

config = NanoGPTConfig()
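
# --- Illustrative sketch (not part of the original config): a single optimizer
# step with gradient accumulation and clipping, showing how batch_size,
# gradient_accumulation_steps, and grad_clip fit together. `model`, `optimizer`,
# and `get_batch` are hypothetical placeholders, not part of this file.
def train_step(model, optimizer, get_batch, cfg: NanoGPTConfig) -> None:
    optimizer.zero_grad(set_to_none=True)
    for _ in range(cfg.gradient_accumulation_steps):
        x, y = get_batch(cfg.batch_size, cfg.block_size)
        _, loss = model(x, y)  # assumes a nanoGPT-style (logits, loss) return
        # Scale so the accumulated gradient equals the mean over micro-batches.
        (loss / cfg.gradient_accumulation_steps).backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), cfg.grad_clip)
    optimizer.step()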

if __name__ == "__main__":
    print(f"Model size: ~{config.n_params:.1f}M parameters")
    print(f"Sequence length: {config.block_size}")
    print(f"Batch size: {config.batch_size}")
    print(f"Effective batch: {config.batch_size * config.gradient_accumulation_steps}")