from __future__ import annotations

from dataclasses import dataclass, replace


@dataclass(frozen=True)
class FOGConfig:
    """Immutable bundle of model hyperparameters.

    One class serves both architecture families configured in this module:
    the baseline/uniform presets set only the shared fields and ``d_ff``,
    while the motif-aware presets additionally set ``d_compare``,
    ``d_memory``, ``d_expand`` and ``d_gate``. Every field has a default,
    so ``FOGConfig()`` is itself a valid (the "small") configuration.

    Instances are frozen; use ``dataclasses.replace`` to derive variants.
    """

    # shared
    vocab_size: int = 512
    d_model: int = 256
    n_layers: int = 6
    n_heads: int = 4
    max_seq_len: int = 256
    dropout: float = 0.1

    # baseline FFN
    d_ff: int = 1024

    # motif-aware attention
    d_compare: int = 64  # key-matching width, total across heads
    d_memory: int = 192  # value-storage width, total across heads

    # motif-aware FFN
    d_expand: int = 512
    d_gate: int = 32  # control-path width


# ── Small configs: the dataclass defaults ─────────────────────
BASELINE_SMALL = FOGConfig()

# Motif widths are spelled out even though they equal the defaults, so the
# preset stays pinned if the defaults ever change.
MOTIF_SMALL = replace(
    BASELINE_SMALL,
    d_compare=64,
    d_memory=192,
    d_expand=512,
    d_gate=32,
)

# Param-matched uniform baseline for controlled comparison
# d_model=94, d_ff=376 → ~432K params to match MOTIF_TINY
UNIFORM_TINY = FOGConfig(
    vocab_size=32,
    d_model=94,
    n_layers=4,
    n_heads=2,
    max_seq_len=32,
    d_ff=376,
)

# ── Tiny configs for fast iteration ───────────────────────────
BASELINE_TINY = FOGConfig(
    vocab_size=32,
    d_model=128,
    n_layers=4,
    n_heads=4,
    max_seq_len=32,
    d_ff=512,
)

# Derived from BASELINE_TINY so the shared fields cannot drift apart.
MOTIF_TINY = replace(
    BASELINE_TINY,
    d_compare=32,
    d_memory=96,
    d_expand=256,
    d_gate=16,
)

# ── Micro configs: models at capacity boundary ────────────────
# Both architectures struggle, differences emerge.
# NOTE(review): this header originally quoted "~5-10K params", but the
# UNIFORM_MICRO note below quotes 46,632 params as an exact match with
# MOTIF_MICRO. The two figures disagree; confirm which one is current.
BASELINE_MICRO = FOGConfig(
    vocab_size=64,
    d_model=48,
    n_layers=3,
    n_heads=2,
    max_seq_len=64,
    dropout=0.0,
    d_ff=96,
)

# Derived from BASELINE_MICRO so the shared fields cannot drift apart.
MOTIF_MICRO = replace(
    BASELINE_MICRO,
    d_compare=12,  # narrow: precise key matching (6 per head)
    d_memory=36,  # wide: value storage (18 per head)
    d_expand=72,
    d_gate=12,  # thin: control path
)

# Param-matched uniform baseline for micro
# d_model=42, d_ff=74 → 46,632 params (exact match with MOTIF_MICRO)
UNIFORM_MICRO = FOGConfig(
    vocab_size=64,
    d_model=42,
    n_layers=3,
    n_heads=2,
    max_seq_len=64,
    dropout=0.0,
    d_ff=74,
)

# ── Medium configs: 400-800K params, hard tasks ───────────────
# vocab=256, seq=128 — enough combinatorial diversity to stress models
BASELINE_MED = FOGConfig(
    vocab_size=256,
    d_model=128,
    n_layers=4,
    n_heads=4,
    max_seq_len=128,
    dropout=0.05,
    d_ff=512,
)

# Derived from BASELINE_MED so the shared fields cannot drift apart.
MOTIF_MED = replace(
    BASELINE_MED,
    d_compare=32,  # narrow: 8 per head
    d_memory=96,  # wide: 24 per head
    d_expand=256,
    d_gate=16,
)

# Param-matched uniform for med
# d_model=96, d_ff=369 → ~473K params (matches MOTIF_MED)
UNIFORM_MED = FOGConfig(
    vocab_size=256,
    d_model=96,
    n_layers=4,
    n_heads=4,
    max_seq_len=128,
    dropout=0.05,
    d_ff=369,
)