"""
model_configs.py - Model configurations for the Q&A test with FineWeb-Edu.
Defines larger models (up to ~350M params) optimized for an M2 Max with 64GB of RAM.
Supports expanded vocabularies (32K-50K) for better tokenization efficiency.
"""
from dataclasses import dataclass
from typing import Dict, Optional


@dataclass
class QAModelConfig:
"""Configuration for a RippleGPT model for Q&A."""
name: str
n_layer: int
n_head: int
n_embd: int
block_size: int
dropout: float = 0.1
vocab_size: int = 32000 # Default BPE tokenizer
batch_size: int = 32 # Optimized for M2 Max
    attention_window: Optional[int] = None  # RFC-001: sliding window size (None = full attention); see sketch below

@property
def approx_params(self) -> str:
"""Rough parameter estimation."""
        # Rough heuristic: 12 * n_layer * n_embd^2 (embedding parameters excluded)
params = 12 * self.n_layer * (self.n_embd ** 2)
if params >= 1e9:
return f"{params/1e9:.1f}B"
elif params >= 1e6:
return f"{params/1e6:.0f}M"
else:
return f"{params/1e3:.0f}K"
@property
def estimated_ram_gb(self) -> float:
"""Estimated RAM usage in GB during training."""
# Approximate formula considering gradients and activations
params = 12 * self.n_layer * (self.n_embd ** 2)
        # Weights + gradients + optimizer state + activations ≈ 20x parameters (rough heuristic)
bytes_needed = params * 4 * 20 # float32
# Add batch memory
batch_mem = self.batch_size * self.block_size * self.n_embd * 4
return (bytes_needed + batch_mem) / 1e9
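
# Worked example of estimated_ram_gb (rough, float32 assumed throughout):
# LARGE_CONFIG -> params ≈ 12 * 12 * 1056**2 ≈ 160.6M;
# 160.6M params * 4 bytes * 20 ≈ 12.85 GB, plus a small batch term
# (16 * 1024 * 1056 * 4 bytes ≈ 69 MB), i.e. roughly 12.9 GB in total.
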
# ============================================================================
# MODEL CONFIGURATIONS - OPTIMIZED FOR M2 MAX (64GB)
# ============================================================================
# Small model for quick tests
SMALL_CONFIG = QAModelConfig(
name="small",
n_layer=8,
n_head=8,
n_embd=512,
block_size=512,
dropout=0.15,
vocab_size=32000,
batch_size=64
)
# Medium model for validation
MEDIUM_CONFIG = QAModelConfig(
name="medium",
n_layer=12,
n_head=12,
n_embd=768,
block_size=1024,
dropout=0.1,
vocab_size=32000,
batch_size=32
)
# 🎯 TARGET MODEL: ~160M params - FITS IN MPS 64GB!
# Reduced from 24 to 12 layers to fit in memory
# Uses sliding-window attention for efficiency
# NOTE: n_embd=1056 (12 × 88) so it is divisible by n_head
LARGE_CONFIG = QAModelConfig(
name="large",
n_layer=12, # Reduced to fit MPS memory
n_head=12, # Proportionally reduced
n_embd=1056, # 12 × 88 = 1056 (divisible!)
block_size=1024,
dropout=0.1,
vocab_size=32000,
batch_size=16,
attention_window=512 # Sliding window to save memory
)
# Extra large model (~300M, formerly large) - MAY OOM!
# Use only if you have 96GB+ RAM or dedicated GPU
XLARGE_CONFIG = QAModelConfig(
name="xlarge",
n_layer=24,
n_head=16,
n_embd=1024,
block_size=1024,
dropout=0.1,
vocab_size=32000,
    batch_size=8,  # Kept very small to try to fit in memory
attention_window=256 # Very aggressive sliding window
)
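
# Illustrative sketch only; NOT RippleGPT's actual attention code (see
# RFC-001 for that). It shows the boolean causal mask that a sliding
# attention_window describes: mask[i][j] is True when query position i
# may attend to key position j.
def _sliding_window_mask_sketch(block_size: int, window: int) -> list:
    """Causal mask restricted to the last `window` positions."""
    return [
        [max(0, i - window + 1) <= j <= i for j in range(block_size)]
        for i in range(block_size)
    ]
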
# Mapping by name
CONFIGS: Dict[str, QAModelConfig] = {
"small": SMALL_CONFIG,
"medium": MEDIUM_CONFIG,
"large": LARGE_CONFIG,
"xlarge": XLARGE_CONFIG
}

def get_config(name: str) -> QAModelConfig:
"""Returns configuration by name."""
if name not in CONFIGS:
raise ValueError(f"Config '{name}' not found. Options: {list(CONFIGS.keys())}")
return CONFIGS[name]
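
# Minimal usage sketch (comments only; values come from the heuristics above):
#
#   from model_configs import get_config
#   cfg = get_config("large")
#   print(cfg.approx_params, f"{cfg.estimated_ram_gb:.1f}GB")  # -> 161M 12.9GB
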
def print_configs():
"""Prints all available configurations."""
print("\n📋 Model Configurations for Q&A (FineWeb-Edu):")
print("=" * 85)
print(f"{'Name':<10} {'Layers':<8} {'Heads':<8} {'Embd':<8} {'Block':<8} {'Vocab':<8} {'Batch':<8} {'~Params':<10} {'~RAM':<8}")
print("-" * 85)
for name, cfg in CONFIGS.items():
marker = "🎯" if name == "large" else " "
print(f"{marker}{cfg.name:<8} {cfg.n_layer:<8} {cfg.n_head:<8} {cfg.n_embd:<8} {cfg.block_size:<8} {cfg.vocab_size:<8} {cfg.batch_size:<8} {cfg.approx_params:<10} {cfg.estimated_ram_gb:.0f}GB")
print("=" * 85)
print("🎯 = Recommended configuration for M2 Max (64GB)")
print("⚠️ xlarge may cause OOM on MPS, use CUDA or reduce batch")

if __name__ == '__main__':
    print_configs()