"""
model_configs.py - Model configurations for the Q&A test with FineWeb-Edu.
Defines larger models (up to ~350M params) optimized for an M2 Max with 64GB of RAM.
Supports expanded vocabularies (32K-50K) for better tokenization efficiency.
"""
from dataclasses import dataclass
from typing import Dict, Optional


@dataclass
class QAModelConfig:
"""Configuration for a RippleGPT model for Q&A."""
name: str
n_layer: int
n_head: int
n_embd: int
block_size: int
dropout: float = 0.1
vocab_size: int = 32000 # Default BPE tokenizer
batch_size: int = 32 # Optimized for M2 Max
    attention_window: Optional[int] = None  # RFC-001: sliding window size (None = full attention); see sketch below

@property
def approx_params(self) -> str:
"""Rough parameter estimation."""
        # Rough heuristic: 12 * n_layer * n_embd^2 (embedding parameters excluded)
params = 12 * self.n_layer * (self.n_embd ** 2)
if params >= 1e9:
return f"{params/1e9:.1f}B"
elif params >= 1e6:
return f"{params/1e6:.0f}M"
else:
return f"{params/1e3:.0f}K"
@property
def estimated_ram_gb(self) -> float:
"""Estimated RAM usage in GB during training."""
# Approximate formula considering gradients and activations
params = 12 * self.n_layer * (self.n_embd ** 2)
        # Weights + gradients + optimizer state + activations ≈ 20x parameters (rough heuristic)
bytes_needed = params * 4 * 20 # float32
# Add batch memory
batch_mem = self.batch_size * self.block_size * self.n_embd * 4
return (bytes_needed + batch_mem) / 1e9
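
# Worked example of estimated_ram_gb (rough, float32 assumed throughout):
# LARGE_CONFIG -> params ≈ 12 * 12 * 1056**2 ≈ 160.6M;
# 160.6M params * 4 bytes * 20 ≈ 12.85 GB, plus a small batch term
# (16 * 1024 * 1056 * 4 bytes ≈ 69 MB), i.e. roughly 12.9 GB in total.
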
# ============================================================================
# MODEL CONFIGURATIONS - OPTIMIZED FOR M2 MAX (64GB)
# ============================================================================
# Small model for quick tests
SMALL_CONFIG = QAModelConfig(
name="small",
n_layer=8,
n_head=8,
n_embd=512,
block_size=512,
dropout=0.15,
vocab_size=32000,
batch_size=64
)
# Medium model for validation
MEDIUM_CONFIG = QAModelConfig(
name="medium",
n_layer=12,
n_head=12,
n_embd=768,
block_size=1024,
dropout=0.1,
vocab_size=32000,
batch_size=32
)
# 🎯 TARGET MODEL: ~160M params - FITS IN MPS 64GB!
# Reduced from 24 to 12 layers to fit in memory
# Uses sliding-window attention for efficiency
# NOTE: n_embd=1056 (12 × 88) so it is divisible by n_head
LARGE_CONFIG = QAModelConfig(
name="large",
n_layer=12, # Reduced to fit MPS memory
n_head=12, # Proportionally reduced
n_embd=1056, # 12 × 88 = 1056 (divisible!)
block_size=1024,
dropout=0.1,
vocab_size=32000,
batch_size=16,
attention_window=512 # Sliding window to save memory
)
# Extra large model (~300M, formerly large) - MAY OOM!
# Use only if you have 96GB+ RAM or dedicated GPU
XLARGE_CONFIG = QAModelConfig(
name="xlarge",
n_layer=24,
n_head=16,
n_embd=1024,
block_size=1024,
dropout=0.1,
vocab_size=32000,
    batch_size=8,  # Kept very small to try to fit in memory
attention_window=256 # Very aggressive sliding window
)
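
# Illustrative sketch only; NOT RippleGPT's actual attention code (see
# RFC-001 for that). It shows the boolean causal mask that a sliding
# attention_window describes: mask[i][j] is True when query position i
# may attend to key position j.
def _sliding_window_mask_sketch(block_size: int, window: int) -> list:
    """Causal mask restricted to the last `window` positions."""
    return [
        [max(0, i - window + 1) <= j <= i for j in range(block_size)]
        for i in range(block_size)
    ]
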
# Mapping by name
CONFIGS: Dict[str, QAModelConfig] = {
"small": SMALL_CONFIG,
"medium": MEDIUM_CONFIG,
"large": LARGE_CONFIG,
"xlarge": XLARGE_CONFIG
}

def get_config(name: str) -> QAModelConfig:
"""Returns configuration by name."""
if name not in CONFIGS:
raise ValueError(f"Config '{name}' not found. Options: {list(CONFIGS.keys())}")
return CONFIGS[name]
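
# Minimal usage sketch (comments only; values come from the heuristics above):
#
#   from model_configs import get_config
#   cfg = get_config("large")
#   print(cfg.approx_params, f"{cfg.estimated_ram_gb:.1f}GB")  # -> 161M 12.9GB
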
def print_configs():
"""Prints all available configurations."""
print("\n📋 Model Configurations for Q&A (FineWeb-Edu):")
print("=" * 85)
print(f"{'Name':<10} {'Layers':<8} {'Heads':<8} {'Embd':<8} {'Block':<8} {'Vocab':<8} {'Batch':<8} {'~Params':<10} {'~RAM':<8}")
print("-" * 85)
for name, cfg in CONFIGS.items():
marker = "🎯" if name == "large" else " "
print(f"{marker}{cfg.name:<8} {cfg.n_layer:<8} {cfg.n_head:<8} {cfg.n_embd:<8} {cfg.block_size:<8} {cfg.vocab_size:<8} {cfg.batch_size:<8} {cfg.approx_params:<10} {cfg.estimated_ram_gb:.0f}GB")
print("=" * 85)
print("🎯 = Recommended configuration for M2 Max (64GB)")
print("⚠️ xlarge may cause OOM on MPS, use CUDA or reduce batch")

if __name__ == '__main__':
    print_configs()