Spaces:

kharki
/

abpt

Paused

abpt / src /fog /config.py

sync: FOG micro+medium configs, stress tasks, fast pipeline

6ef010e 2 months ago

2.92 kB

	from __future__ import annotations

	from dataclasses import dataclass


	@dataclass(frozen=True)
	class FOGConfig:
	    """Immutable set of size hyperparameters shared by the FOG model configs.

	    The fields fall into four groups: values common to every model, the
	    baseline feed-forward width, the motif-aware attention widths, and the
	    motif-aware feed-forward widths. Every field has a default, so
	    ``FOGConfig()`` is a complete configuration and the presets in this
	    module override only the values they need.

	    The dataclass is frozen: attributes cannot be reassigned after
	    construction, and instances compare and hash by field value.
	    """

	    # shared
	    vocab_size: int = 512
	    d_model: int = 256
	    n_layers: int = 6
	    n_heads: int = 4
	    max_seq_len: int = 256
	    dropout: float = 0.1

	    # baseline FFN
	    d_ff: int = 1024

	    # motif-aware attention
	    # NOTE(review): the presets annotate these as "per head" widths, which
	    # suggests each is split across n_heads — confirm against the model code.
	    d_compare: int = 64
	    d_memory: int = 192

	    # motif-aware FFN
	    d_expand: int = 512
	    d_gate: int = 32


	# Small baseline: every field left at its FOGConfig default.
	BASELINE_SMALL = FOGConfig()

	# Small motif model: the four motif widths are spelled out explicitly.
	MOTIF_SMALL = FOGConfig(
	    d_compare=64, d_memory=192,  # motif-aware attention
	    d_expand=512, d_gate=32,  # motif-aware FFN
	)

	# Uniform baseline sized to match MOTIF_TINY for a controlled comparison:
	# d_model=94, d_ff=376 → ~432K params
	UNIFORM_TINY = FOGConfig(
	    vocab_size=32, max_seq_len=32,
	    n_layers=4, n_heads=2,
	    d_model=94, d_ff=376,
	)

	# Tiny presets, meant for fast iteration.

	# Baseline: only the shared sizes and d_ff are overridden.
	BASELINE_TINY = FOGConfig(
	    vocab_size=32, max_seq_len=32,
	    n_layers=4, n_heads=4,
	    d_model=128, d_ff=512,
	)

	# Motif variant: same shared sizes as BASELINE_TINY plus the motif widths.
	MOTIF_TINY = FOGConfig(
	    vocab_size=32, max_seq_len=32,
	    n_layers=4, n_heads=4,
	    d_model=128, d_ff=512,
	    d_compare=32, d_memory=96,
	    d_expand=256, d_gate=16,
	)

	# ── Micro configs: models at capacity boundary ────────────────
	# tens of thousands of params (46,632 for the param-matched MOTIF_MICRO / UNIFORM_MICRO pair) — both architectures struggle, differences emerge

	# Micro baseline: dropout disabled, shared sizes and d_ff overridden.
	BASELINE_MICRO = FOGConfig(
	    vocab_size=64, max_seq_len=64,
	    n_layers=3, n_heads=2,
	    d_model=48, d_ff=96,
	    dropout=0.0,
	)

	# Micro motif model: same shared sizes as BASELINE_MICRO plus motif widths.
	MOTIF_MICRO = FOGConfig(
	    vocab_size=64, max_seq_len=64,
	    n_layers=3, n_heads=2,
	    d_model=48, d_ff=96,
	    dropout=0.0,
	    d_compare=12,  # narrow: precise key matching (6 per head)
	    d_memory=36,  # wide: value storage (18 per head)
	    d_expand=72,
	    d_gate=12,  # thin: control path
	)

	# Uniform baseline sized to match MOTIF_MICRO:
	# d_model=42, d_ff=74 → 46,632 params (exact match with MOTIF_MICRO)
	UNIFORM_MICRO = FOGConfig(
	    vocab_size=64, max_seq_len=64,
	    n_layers=3, n_heads=2,
	    d_model=42, d_ff=74,
	    dropout=0.0,
	)

	# ── Medium configs: 400-800K params, hard tasks ───────────────
	# vocab=256, seq=128 — enough combinatorial diversity to stress models

	# Medium baseline: light dropout, shared sizes and d_ff overridden.
	BASELINE_MED = FOGConfig(
	    vocab_size=256, max_seq_len=128,
	    n_layers=4, n_heads=4,
	    d_model=128, d_ff=512,
	    dropout=0.05,
	)

	# Medium motif model: same shared sizes as BASELINE_MED plus motif widths.
	MOTIF_MED = FOGConfig(
	    vocab_size=256, max_seq_len=128,
	    n_layers=4, n_heads=4,
	    d_model=128, d_ff=512,
	    dropout=0.05,
	    d_compare=32,  # narrow: 8 per head
	    d_memory=96,  # wide: 24 per head
	    d_expand=256,
	    d_gate=16,
	)

	# Uniform baseline sized to match MOTIF_MED:
	# d_model=96, d_ff=369 → ~473K params (matches MOTIF_MED)
	UNIFORM_MED = FOGConfig(
	    vocab_size=256, max_seq_len=128,
	    n_layers=4, n_heads=4,
	    d_model=96, d_ff=369,
	    dropout=0.05,
	)