"""
Nano-GPT Configuration
L4-SAFE: Reduced memory usage
"""
import torch
from dataclasses import dataclass
from typing import Optional

@dataclass
class NanoGPTConfig:
    # Model Architecture
    vocab_size: int = 32000
    n_layers: int = 8       # REDUCED from 12
    n_heads: int = 8        # REDUCED from 12
    n_embd: int = 512       # REDUCED from 768
    block_size: int = 512   # REDUCED from 1024 (KEY!)
    dropout: float = 0.1
    bias: bool = True

    # Training Hyperparameters
    batch_size: int = 16                  # REDUCED from 32 (KEY!)
    gradient_accumulation_steps: int = 8  # INCREASED from 4
    learning_rate: float = 3e-4
    max_iters: int = 100000
    weight_decay: float = 0.1
    beta1: float = 0.9
    beta2: float = 0.95
    grad_clip: float = 1.0

    # Learning Rate Scheduling
    decay_lr: bool = True
    warmup_iters: int = 2000
    lr_decay_iters: int = 100000
    min_lr: float = 3e-5

    # Evaluation & Logging
    eval_interval: int = 1000
    eval_iters: int = 100   # REDUCED from 200
    log_interval: int = 100

    # Checkpointing
    save_interval: int = 5000
    checkpoint_dir: str = "checkpoints"

    # Data
    dataset_mix: Optional[dict] = None

    # Hardware
    device: str = 'cuda' if torch.cuda.is_available() else 'cpu'
    dtype: str = 'bfloat16'
    compile: bool = False   # DISABLED torch.compile (uses more memory!)

    # Reproducibility
    seed: int = 42

    def __post_init__(self):
        # A mutable dict cannot be a dataclass field default, so the dataset
        # mixture defaults to None and is filled in here.
        if self.dataset_mix is None:
            self.dataset_mix = {
                'fineweb': 1.0
            }

    @property
    def n_params(self):
        # Approximate parameter count in millions: 2 * vocab_size * n_embd for
        # the vocabulary-facing matrices plus ~12 * n_embd^2 per transformer layer.
        return (2 * self.vocab_size * self.n_embd +
                12 * self.n_layers * self.n_embd * self.n_embd) / 1e6
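
    # Illustrative helper (an addition, derived from the fields above): tokens
    # consumed per optimizer step = batch_size * gradient_accumulation_steps * block_size.
    @property
    def tokens_per_iter(self) -> int:
        return self.batch_size * self.gradient_accumulation_steps * self.block_size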

config = NanoGPTConfig()

if __name__ == "__main__":
    print(f"Model size: ~{config.n_params:.1f}M parameters")
    print(f"Sequence length: {config.block_size}")
    print(f"Batch size: {config.batch_size}")
    print(f"Effective batch: {config.batch_size * config.gradient_accumulation_steps}")