""" Nano-GPT Configuration L4-SAFE: Reduced memory usage """ import torch from dataclasses import dataclass @dataclass class NanoGPTConfig: # Model Architecture vocab_size: int = 32000 n_layers: int = 8 # REDUCED from 12 n_heads: int = 8 # REDUCED from 12 n_embd: int = 512 # REDUCED from 768 block_size: int = 512 # REDUCED from 1024 (KEY!) dropout: float = 0.1 bias: bool = True # Training Hyperparameters batch_size: int = 16 # REDUCED from 32 (KEY!) gradient_accumulation_steps: int = 8 # INCREASED from 4 learning_rate: float = 3e-4 max_iters: int = 100000 weight_decay: float = 0.1 beta1: float = 0.9 beta2: float = 0.95 grad_clip: float = 1.0 # Learning Rate Scheduling decay_lr: bool = True warmup_iters: int = 2000 lr_decay_iters: int = 100000 min_lr: float = 3e-5 # Evaluation & Logging eval_interval: int = 1000 eval_iters: int = 100 # REDUCED from 200 log_interval: int = 100 # Checkpointing save_interval: int = 5000 checkpoint_dir: str = "checkpoints" # Data dataset_mix: dict = None # Hardware device: str = 'cuda' if torch.cuda.is_available() else 'cpu' dtype: str = 'bfloat16' compile: bool = False # DISABLED torch.compile (uses more memory!) # Reproducibility seed: int = 42 def __post_init__(self): if self.dataset_mix is None: self.dataset_mix = { 'fineweb': 1.0 } @property def n_params(self): return (2 * self.vocab_size * self.n_embd + 12 * self.n_layers * self.n_embd * self.n_embd) / 1e6 config = NanoGPTConfig() if __name__ == "__main__": print(f"Model size: ~{config.n_params:.1f}M parameters") print(f"Sequence length: {config.block_size}") print(f"Batch size: {config.batch_size}") print(f"Effective batch: {config.batch_size * config.gradient_accumulation_steps}") @dataclass class FinetuneConfig(NanoGPTConfig): """Config for instruction-tuned models""" finetune_lr: float = 1e-4 finetune_epochs: int = 3