Spaces:
Paused
Paused
| from __future__ import annotations | |
| from dataclasses import dataclass | |
@dataclass
class FOGConfig:
    """Hyperparameters shared by the baseline and motif-aware model presets.

    Every field carries a default, so ``FOGConfig()`` is a complete
    configuration and a preset only needs to override the fields it changes.
    The ``@dataclass`` decorator is required: it generates the keyword
    ``__init__`` that the presets in this module call (for example
    ``FOGConfig(vocab_size=32, d_model=94, ...)``), plus ``__repr__`` and
    ``__eq__``.
    """

    # --- shared by every variant ---
    vocab_size: int = 512
    d_model: int = 256
    n_layers: int = 6
    n_heads: int = 4
    max_seq_len: int = 256
    dropout: float = 0.1

    # --- baseline FFN ---
    d_ff: int = 1024

    # --- motif-aware attention ---
    # Preset comments in this module describe d_compare as the narrow
    # key-matching width and d_memory as the wide value-storage width.
    d_compare: int = 64
    d_memory: int = 192

    # --- motif-aware FFN ---
    # Preset comments in this module describe d_gate as the thin control path.
    d_expand: int = 512
    d_gate: int = 32
# Small presets. BASELINE_SMALL takes every class default unchanged.
BASELINE_SMALL = FOGConfig()

# MOTIF_SMALL spells out the motif-specific widths explicitly; all other
# fields fall back to the class defaults.
MOTIF_SMALL = FOGConfig(
    d_gate=32,
    d_expand=512,
    d_memory=192,
    d_compare=64,
)
# Uniform baseline sized for a controlled comparison against MOTIF_TINY:
# d_model=94 with d_ff=376 is quoted as ~432K params, matching MOTIF_TINY.
UNIFORM_TINY = FOGConfig(
    vocab_size=32,
    max_seq_len=32,
    n_layers=4,
    n_heads=2,
    d_model=94,
    d_ff=376,
)

# Tiny presets, intended for fast iteration.
BASELINE_TINY = FOGConfig(
    vocab_size=32,
    max_seq_len=32,
    n_layers=4,
    n_heads=4,
    d_model=128,
    d_ff=512,
)

MOTIF_TINY = FOGConfig(
    vocab_size=32,
    max_seq_len=32,
    n_layers=4,
    n_heads=4,
    d_model=128,
    d_ff=512,
    # motif-aware attention widths
    d_compare=32,
    d_memory=96,
    # motif-aware FFN widths
    d_expand=256,
    d_gate=16,
)
# -- Micro configs: models at the capacity boundary ---------------------
# Described as ~5-10K params, where both architectures struggle and
# differences emerge.
# NOTE(review): that range disagrees with the 46,632-param figure quoted
# for UNIFORM_MICRO below -- confirm the actual parameter counts.
BASELINE_MICRO = FOGConfig(
    vocab_size=64,
    max_seq_len=64,
    n_layers=3,
    n_heads=2,
    d_model=48,
    d_ff=96,
    dropout=0.0,
)

MOTIF_MICRO = FOGConfig(
    vocab_size=64,
    max_seq_len=64,
    n_layers=3,
    n_heads=2,
    d_model=48,
    d_ff=96,
    dropout=0.0,
    d_compare=12,  # narrow, for precise key matching: 6 per head
    d_memory=36,  # wide, for value storage: 18 per head
    d_expand=72,
    d_gate=12,  # thin control path
)

# Uniform baseline param-matched to MOTIF_MICRO: d_model=42 with d_ff=74
# is quoted as 46,632 params, an exact match.
UNIFORM_MICRO = FOGConfig(
    vocab_size=64,
    max_seq_len=64,
    n_layers=3,
    n_heads=2,
    d_model=42,
    d_ff=74,
    dropout=0.0,
)
# -- Medium configs: 400-800K params, hard tasks -------------------------
# vocab=256 and seq=128 are chosen to give enough combinatorial diversity
# to stress the models.
BASELINE_MED = FOGConfig(
    vocab_size=256,
    max_seq_len=128,
    n_layers=4,
    n_heads=4,
    d_model=128,
    d_ff=512,
    dropout=0.05,
)

MOTIF_MED = FOGConfig(
    vocab_size=256,
    max_seq_len=128,
    n_layers=4,
    n_heads=4,
    d_model=128,
    d_ff=512,
    dropout=0.05,
    d_compare=32,  # narrow: 8 per head
    d_memory=96,  # wide: 24 per head
    d_expand=256,
    d_gate=16,
)

# Uniform baseline param-matched to MOTIF_MED: d_model=96 with d_ff=369
# is quoted as ~473K params.
UNIFORM_MED = FOGConfig(
    vocab_size=256,
    max_seq_len=128,
    n_layers=4,
    n_heads=4,
    d_model=96,
    d_ff=369,
    dropout=0.05,
)