# abpt/src/fog/config.py
# Commit 6ef010e: sync: FOG micro+medium configs, stress tasks, fast pipeline
from __future__ import annotations
from dataclasses import dataclass
@dataclass(frozen=True)
class FOGConfig:
    """Immutable bundle of hyperparameters for the FOG model variants.

    Every field has a default, so ``FOGConfig()`` is a valid configuration.
    The dataclass is frozen: assigning to a field after construction raises
    ``dataclasses.FrozenInstanceError``, and instances are hashable and
    compare by value.

    Fields are grouped by the component they are labelled for. A single
    class carries both the baseline and the motif-aware dimensions.
    NOTE(review): which fields each model variant actually reads is decided
    by the consuming code, which is not visible here; confirm there.
    """

    # shared
    vocab_size: int = 512
    d_model: int = 256
    n_layers: int = 6
    n_heads: int = 4
    max_seq_len: int = 256
    dropout: float = 0.1

    # baseline FFN
    d_ff: int = 1024

    # motif-aware attention
    d_compare: int = 64
    d_memory: int = 192

    # motif-aware FFN
    d_expand: int = 512
    d_gate: int = 32
# Default-sized pair. BASELINE_SMALL takes every FOGConfig default as-is.
BASELINE_SMALL = FOGConfig()

# MOTIF_SMALL pins the motif-aware dims explicitly. The values coincide with
# the FOGConfig field defaults, so the two configs currently compare equal.
MOTIF_SMALL = FOGConfig(
    # motif-aware attention
    d_compare=64,
    d_memory=192,
    # motif-aware FFN
    d_expand=512,
    d_gate=32,
)
# Param-matched uniform baseline for controlled comparison
# d_model=94, d_ff=376 → ~432K params to match MOTIF_TINY
UNIFORM_TINY = FOGConfig(
    # shared (dropout stays at the FOGConfig default)
    vocab_size=32,
    d_model=94,
    n_layers=4,
    n_heads=2,
    max_seq_len=32,
    # baseline FFN
    d_ff=376,
    # motif-aware dims are not overridden; they keep the FOGConfig defaults
)
# Tiny configs for fast iteration
BASELINE_TINY = FOGConfig(
    # shared (dropout stays at the FOGConfig default)
    vocab_size=32,
    d_model=128,
    n_layers=4,
    n_heads=4,
    max_seq_len=32,
    # baseline FFN
    d_ff=512,
    # motif-aware dims are not overridden; they keep the FOGConfig defaults
)
MOTIF_TINY = FOGConfig(
    # shared: same values as BASELINE_TINY
    vocab_size=32,
    d_model=128,
    n_layers=4,
    n_heads=4,
    max_seq_len=32,
    # baseline FFN: same value as BASELINE_TINY
    d_ff=512,
    # motif-aware attention
    d_compare=32,
    d_memory=96,
    # motif-aware FFN
    d_expand=256,
    d_gate=16,
)
# ── Micro configs: models at capacity boundary ────────────────
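# ~5-10K params — both architectures struggle, differences emerge
# NOTE(review): the "~5-10K" figure looks stale; the UNIFORM_MICRO comment
# below reports 46,632 params for the param-matched config. Confirm.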
BASELINE_MICRO = FOGConfig(
    # shared
    vocab_size=64,
    d_model=48,
    n_layers=3,
    n_heads=2,
    max_seq_len=64,
    dropout=0.0,  # dropout disabled for the micro configs
    # baseline FFN
    d_ff=96,
    # motif-aware dims are not overridden; they keep the FOGConfig defaults
)
MOTIF_MICRO = FOGConfig(
    # shared: same values as BASELINE_MICRO
    vocab_size=64,
    d_model=48,
    n_layers=3,
    n_heads=2,
    max_seq_len=64,
    dropout=0.0,
    # baseline FFN: same value as BASELINE_MICRO
    d_ff=96,
    # motif-aware attention
    d_compare=12,  # narrow: precise key matching (6 per head)
    d_memory=36,  # wide: value storage (18 per head)
    # motif-aware FFN
    d_expand=72,
    d_gate=12,  # thin: control path
)
# Param-matched uniform baseline for micro
# d_model=42, d_ff=74 → 46,632 params (exact match with MOTIF_MICRO)
UNIFORM_MICRO = FOGConfig(
    # shared
    vocab_size=64,
    d_model=42,
    n_layers=3,
    n_heads=2,
    max_seq_len=64,
    dropout=0.0,
    # baseline FFN
    d_ff=74,
    # motif-aware dims are not overridden; they keep the FOGConfig defaults
)
# ── Medium configs: 400-800K params, hard tasks ───────────────
# vocab=256, seq=128 — enough combinatorial diversity to stress models
BASELINE_MED = FOGConfig(
    # shared
    vocab_size=256,
    d_model=128,
    n_layers=4,
    n_heads=4,
    max_seq_len=128,
    dropout=0.05,
    # baseline FFN
    d_ff=512,
    # motif-aware dims are not overridden; they keep the FOGConfig defaults
)
MOTIF_MED = FOGConfig(
    # shared: same values as BASELINE_MED
    vocab_size=256,
    d_model=128,
    n_layers=4,
    n_heads=4,
    max_seq_len=128,
    dropout=0.05,
    # baseline FFN: same value as BASELINE_MED
    d_ff=512,
    # motif-aware attention
    d_compare=32,  # narrow: 8 per head
    d_memory=96,  # wide: 24 per head
    # motif-aware FFN
    d_expand=256,
    d_gate=16,
)
# Param-matched uniform for med
# d_model=96, d_ff=369 → ~473K params (matches MOTIF_MED)
UNIFORM_MED = FOGConfig(
    # shared
    vocab_size=256,
    d_model=96,
    n_layers=4,
    n_heads=4,
    max_seq_len=128,
    dropout=0.05,
    # baseline FFN
    d_ff=369,
    # motif-aware dims are not overridden; they keep the FOGConfig defaults
)