from __future__ import annotations

from dataclasses import dataclass


@dataclass(frozen=True)
class FOGConfig:
    """Immutable bundle of model hyperparameters.

    One instance describes a single model size. The fields fall into three
    groups: settings shared by every variant, the feed-forward width of the
    baseline variant, and the extra widths used by the motif-aware attention
    and motif-aware feed-forward variants.

    Every field has a default, so ``FOGConfig()`` is a valid configuration.
    ``frozen=True`` makes instances read-only after construction.

    NOTE(review): nothing is validated here (e.g. that the per-head widths
    divide evenly by ``n_heads``) — presumably the consuming model code
    relies on the presets being self-consistent; confirm against the caller.
    """

    # shared
    # Vocabulary size.
    vocab_size: int = 512
    # Main model width.
    d_model: int = 256
    # Number of stacked layers.
    n_layers: int = 6
    # Number of attention heads.
    n_heads: int = 4
    # Maximum sequence length.
    max_seq_len: int = 256
    # Dropout rate; some presets in this module set it to 0.0 to disable it.
    dropout: float = 0.1

    # baseline FFN
    # Feed-forward width of the baseline variant.
    d_ff: int = 1024

    # motif-aware attention
    # Narrow width used for key matching, split across heads (per the
    # comments on the motif presets in this module).
    d_compare: int = 64
    # Wide width used for value storage, split across heads (per the
    # comments on the motif presets in this module).
    d_memory: int = 192

    # motif-aware FFN
    # Expansion width of the motif-aware FFN — presumably the counterpart of
    # d_ff; TODO confirm against the model code.
    d_expand: int = 512
    # Thin "control path" width (wording taken from the motif preset comments).
    d_gate: int = 32


# Default-sized baseline: every field keeps its FOGConfig default.
BASELINE_SMALL = FOGConfig()

# Default-sized motif variant. The motif-aware widths are spelled out
# explicitly even though they coincide with the FOGConfig defaults.
MOTIF_SMALL = FOGConfig(
    # motif-aware FFN
    d_expand=512,
    d_gate=32,
    # motif-aware attention
    d_compare=64,
    d_memory=192,
)

# Uniform baseline sized to MOTIF_TINY's parameter budget for a controlled
# comparison: d_model=94 with d_ff=376 gives ~432K params.
UNIFORM_TINY = FOGConfig(
    # data shape
    vocab_size=32,
    max_seq_len=32,
    # model shape
    n_layers=4,
    n_heads=2,
    d_model=94,
    d_ff=376,
)

# Tiny presets — small enough for quick iteration.
BASELINE_TINY = FOGConfig(
    # data shape
    vocab_size=32,
    max_seq_len=32,
    # model shape
    n_layers=4,
    n_heads=4,
    d_model=128,
    d_ff=512,
)

MOTIF_TINY = FOGConfig(
    # data shape
    vocab_size=32,
    max_seq_len=32,
    # model shape
    n_layers=4,
    n_heads=4,
    d_model=128,
    d_ff=512,
    # motif-aware attention
    d_compare=32,
    d_memory=96,
    # motif-aware FFN
    d_expand=256,
    d_gate=16,
)

# ── Micro configs: models at capacity boundary ────────────────
# ~50K params (MOTIF_MICRO / UNIFORM_MICRO are 46,632 each) — both architectures struggle, differences emerge

BASELINE_MICRO = FOGConfig(
    # data shape
    vocab_size=64,
    max_seq_len=64,
    # model shape
    n_layers=3,
    n_heads=2,
    d_model=48,
    d_ff=96,
    # dropout disabled
    dropout=0.0,
)

MOTIF_MICRO = FOGConfig(
    # data shape
    vocab_size=64,
    max_seq_len=64,
    # model shape
    n_layers=3,
    n_heads=2,
    d_model=48,
    d_ff=96,
    # motif-aware attention: a narrow width for precise key matching and a
    # wide one for value storage (6 and 18 per head respectively)
    d_compare=12,
    d_memory=36,
    # motif-aware FFN: expansion width plus a thin control path
    d_expand=72,
    d_gate=12,
    # dropout disabled
    dropout=0.0,
)

# Uniform baseline for the micro tier, param-matched to MOTIF_MICRO:
# d_model=42 with d_ff=74 gives 46,632 params (an exact match).
UNIFORM_MICRO = FOGConfig(
    # data shape
    vocab_size=64,
    max_seq_len=64,
    # model shape
    n_layers=3,
    n_heads=2,
    d_model=42,
    d_ff=74,
    # dropout disabled
    dropout=0.0,
)

# ── Medium configs: 400-800K params, hard tasks ───────────────
# vocab=256, seq=128 — enough combinatorial diversity to stress models

BASELINE_MED = FOGConfig(
    # data shape
    vocab_size=256,
    max_seq_len=128,
    # model shape
    n_layers=4,
    n_heads=4,
    d_model=128,
    d_ff=512,
    # light dropout
    dropout=0.05,
)

MOTIF_MED = FOGConfig(
    # data shape
    vocab_size=256,
    max_seq_len=128,
    # model shape
    n_layers=4,
    n_heads=4,
    d_model=128,
    d_ff=512,
    # motif-aware attention: narrow compare width (8 per head) and wide
    # memory width (24 per head)
    d_compare=32,
    d_memory=96,
    # motif-aware FFN
    d_expand=256,
    d_gate=16,
    # light dropout
    dropout=0.05,
)

# Uniform baseline for the medium tier, param-matched to MOTIF_MED:
# d_model=96 with d_ff=369 gives ~473K params.
UNIFORM_MED = FOGConfig(
    # data shape
    vocab_size=256,
    max_seq_len=128,
    # model shape
    n_layers=4,
    n_heads=4,
    d_model=96,
    d_ff=369,
    # light dropout
    dropout=0.05,
)