| """ |
| model_configs.py - Model configurations for Q&A Test with FineWeb-Edu. |
| |
| Defines larger models (up to 350M) optimized for M2 Max with 64GB RAM. |
| Includes expanded vocabulary (32K-50K) for better efficiency. |
| """ |
|
|
from dataclasses import dataclass
from typing import Dict, Optional
|
|
|
|
@dataclass
class QAModelConfig:
    """Configuration for a RippleGPT model for Q&A.

    Describes a GPT-style transformer: ``n_layer`` blocks, ``n_head``
    attention heads, ``n_embd`` embedding width, and a ``block_size``
    context window.
    """
    name: str
    n_layer: int
    n_head: int
    n_embd: int
    block_size: int
    dropout: float = 0.1
    vocab_size: int = 32000
    batch_size: int = 32
    # Sliding-attention window in tokens; None means full (dense) attention.
    # Fixed annotation: default is None, so the type must be Optional[int].
    attention_window: Optional[int] = None

    @property
    def _transformer_params(self) -> int:
        """Rough transformer parameter count: 12 * L * d^2.

        The classic GPT estimate for the block weights; it deliberately
        ignores the embedding/unembedding tables (vocab_size * n_embd).
        """
        return 12 * self.n_layer * (self.n_embd ** 2)

    @property
    def approx_params(self) -> str:
        """Human-readable rough parameter estimate, e.g. '85M' or '1.3B'."""
        params = self._transformer_params
        if params >= 1e9:
            return f"{params/1e9:.1f}B"
        elif params >= 1e6:
            return f"{params/1e6:.0f}M"
        else:
            return f"{params/1e3:.0f}K"

    @property
    def estimated_ram_gb(self) -> float:
        """Estimated RAM usage in GB during training.

        Uses ~20 bytes-per-parameter-float heuristic (weights, grads,
        optimizer state at fp32) plus one fp32 activation buffer of
        batch_size * block_size * n_embd.
        """
        bytes_needed = self._transformer_params * 4 * 20
        batch_mem = self.batch_size * self.block_size * self.n_embd * 4
        return (bytes_needed + batch_mem) / 1e9
|
|
|
|
| |
| |
| |
|
|
| |
# "small": 8 layers x 512 wide (~25M transformer params) — fast-iteration baseline.
SMALL_CONFIG = QAModelConfig(
    name="small",
    n_layer=8, n_head=8, n_embd=512,
    block_size=512,
    dropout=0.15,
    vocab_size=32000,
    batch_size=64,
)
|
|
| |
# "medium": 12 layers x 768 wide (~85M transformer params), 1K context.
MEDIUM_CONFIG = QAModelConfig(
    name="medium",
    n_layer=12, n_head=12, n_embd=768,
    block_size=1024,
    dropout=0.1,
    vocab_size=32000,
    batch_size=32,
)
|
|
| |
| |
| |
| |
# "large": 12 layers x 1056 wide (~161M transformer params) with a
# 512-token attention window to keep memory bounded at 1K context.
LARGE_CONFIG = QAModelConfig(
    name="large",
    n_layer=12, n_head=12, n_embd=1056,
    block_size=1024,
    dropout=0.1,
    vocab_size=32000,
    batch_size=16,
    attention_window=512,
)
|
|
| |
| |
# "xlarge": 24 layers x 1024 wide (~302M transformer params); smallest batch
# and a tight 256-token attention window to fit in memory.
XLARGE_CONFIG = QAModelConfig(
    name="xlarge",
    n_layer=24, n_head=16, n_embd=1024,
    block_size=1024,
    dropout=0.1,
    vocab_size=32000,
    batch_size=8,
    attention_window=256,
)
|
|
|
|
| |
# Registry of all available configurations, keyed by each config's own name
# ("small", "medium", "large", "xlarge").
CONFIGS: Dict[str, QAModelConfig] = {
    cfg.name: cfg
    for cfg in (SMALL_CONFIG, MEDIUM_CONFIG, LARGE_CONFIG, XLARGE_CONFIG)
}
|
|
|
|
def get_config(name: str) -> QAModelConfig:
    """Look up a model configuration by its registry name.

    Raises ValueError (listing the valid options) for an unknown name.
    """
    config = CONFIGS.get(name)
    if config is None:
        raise ValueError(f"Config '{name}' not found. Options: {list(CONFIGS.keys())}")
    return config
|
|
|
|
def print_configs() -> None:
    """Prints all available configurations.

    Renders a fixed-width table (one row per entry in CONFIGS) with the
    derived approx_params / estimated_ram_gb columns, marking the "large"
    config as the recommended choice.
    """
    print("\n📋 Model Configurations for Q&A (FineWeb-Edu):")
    print("=" * 85)
    # Header widths must match the per-row field widths below.
    print(f"{'Name':<10} {'Layers':<8} {'Heads':<8} {'Embd':<8} {'Block':<8} {'Vocab':<8} {'Batch':<8} {'~Params':<10} {'~RAM':<8}")
    print("-" * 85)

    for name, cfg in CONFIGS.items():
        # Emoji marker flags the recommended "large" configuration.
        marker = "🎯" if name == "large" else " "
        print(f"{marker}{cfg.name:<8} {cfg.n_layer:<8} {cfg.n_head:<8} {cfg.n_embd:<8} {cfg.block_size:<8} {cfg.vocab_size:<8} {cfg.batch_size:<8} {cfg.approx_params:<10} {cfg.estimated_ram_gb:.0f}GB")

    print("=" * 85)
    print("🎯 = Recommended configuration for M2 Max (64GB)")
    print("⚠️ xlarge may cause OOM on MPS, use CUDA or reduce batch")
|
|
|
|
# Script entry point. The original file contained this guard twice, so
# running the module printed the table two times; keep a single guard.
if __name__ == '__main__':
    print_configs()
|
|