Initial release: SRT-Adapter v8a (peer-review distribution)

aa2d4f1 verified about 1 month ago

6.36 kB

	"""Configuration dataclasses for SRT Adapter."""

	from __future__ import annotations

	from dataclasses import dataclass, field


	@dataclass
	class MAHConfig:
	    """Settings for the Metapragmatic Attention Head (MAH).

	    Attributes:
	        d_sub: Semiotic subspace dimension.
	        d_divergence: Divergence vector dimension.
	        num_heads: Number of attention heads.
	        dropout: Dropout value used by the head (default 0.1).
	    """

	    d_sub: int = 512
	    d_divergence: int = 256
	    num_heads: int = 4
	    dropout: float = 0.1


	@dataclass
	class RRMConfig:
	    """Settings for the Reflexive Recurrent Module (RRM).

	    Attributes:
	        d_meta: Dimension of the GRU meta-state.
	        inject_scale: Scale of the FiLM correction. v3 used 0.1 with a
	            linear inject; from v4 on the FiLM inject uses 1.0.
	    """

	    d_meta: int = 512
	    inject_scale: float = 1.0


	@dataclass
	class BENConfig:
	    """Settings for the Bifurcation Estimation Network (BEN).

	    Attributes:
	        d_hidden: Hidden dimension of the MLP.
	    """

	    d_hidden: int = 256


	@dataclass
	class CommunityConfig:
	    """Unsupervised community discovery configuration.

	    Attributes:
	        num_prototypes: Number of soft community clusters.
	        d_community: Community embedding dimension.
	        temperature: Softmax temperature for the assignment.
	        use_prototypes: Whether the discrete prototype basis is used.
	            May be forced to ``False`` by the ``SRT_USE_PROTOTYPES``
	            environment variable (see ``__post_init__``).
	    """

	    num_prototypes: int = 32
	    d_community: int = 64
	    temperature: float = 1.0
	    # v8a: when False, skip the discrete prototype basis entirely; the
	    # encoder output IS the community vector. Motivated by the v7 PCA
	    # finding that prototype tensors barely move from random init across
	    # v5/v6/v7 (mean abs delta ~3e-5) — the encoder was already doing all
	    # the discriminative work and the prototype-mixing readout was
	    # discarding information at the soft-argmax. With use_prototypes=False
	    # the community channel becomes a continuous 64-D coordinate rather
	    # than a soft assignment over K anchors.
	    #
	    # Env override: set SRT_USE_PROTOTYPES to "0", "false", "no" or "off"
	    # (case-insensitive, surrounding whitespace ignored) to flip this off
	    # globally. Lets probe / eval scripts run against v8a checkpoints
	    # without per-script flag plumbing.
	    use_prototypes: bool = True

	    def __post_init__(self) -> None:
	        """Apply the ``SRT_USE_PROTOTYPES`` environment override.

	        The override is one-directional: a recognised "off" value forces
	        ``use_prototypes`` to ``False`` even if ``True`` was passed
	        explicitly. Any other value, or an unset variable, leaves the
	        field untouched.
	        """
	        import os

	        raw = os.environ.get("SRT_USE_PROTOTYPES")
	        if raw is None:
	            return
	        # Strip before matching so values such as "0 " or "false\n" (easy
	        # to produce from .env files or shell substitutions) are not
	        # silently ignored.
	        if raw.strip().lower() in ("0", "false", "no", "off"):
	            self.use_prototypes = False


	@dataclass
	class LossConfig:
	    """Weights and temperatures for the individual loss terms.

	    Each ``*_weight`` field scales one loss term; the per-field comments
	    below record what the term is and why its default was chosen.
	    """

	    ce_weight: float = 1.0
	    # Divergence chain prediction.
	    chain_weight: float = 0.5
	    # Bifurcation (r_hat vs r_true).
	    bif_weight: float = 1.0
	    # Regime classification.
	    regime_weight: float = 5.0
	    # Prevents divergence collapse.
	    div_alive_weight: float = 0.1

	    # Inject-norm regularizer. Set to 0 in v4: the v3 ablation showed it
	    # pushed the optimizer to satisfy ||inj|| = 1 with arbitrary
	    # directions instead of directions useful for the downstream loss.
	    # FiLM init handles gradient flow without needing a norm prior.
	    inject_reg_weight: float = 0.0
	    inject_target_norm: float = 1.0

	    # Encourages diverse community usage.
	    community_entropy_weight: float = 0.01

	    # v4/v5: SupCon loss on the community ENCODER output, keyed by a
	    # source-id hash. Same-source pairs get positive gradient and
	    # different-source pairs negative gradient through the encoder, which
	    # forces prototypes apart. v5 raised the weight from 0.5 to 2.0
	    # because at 0.5 the v4 signal was overwhelmed and the loss flatlined
	    # at log(B-1)=2.71.
	    community_supcon_weight: float = 2.0
	    community_supcon_temperature: float = 0.1

	    # v6 additions:
	    #   * Divergence SupCon on the mean-pooled last-MAH divergence (the
	    #     analog of the v5 community SupCon, applied to the metapragmatic
	    #     channel).
	    #   * ListNet ranking loss on r̂ within each sequence. It sharpens
	    #     ordering; pointwise smooth-L1 alone tolerates large rank errors
	    #     at the tails.
	    #   * Chain-residual auxiliary floor, which keeps the inference signal
	    #     alive after chain_loss has driven the per-position residual
	    #     near zero.
	    divergence_supcon_weight: float = 1.0
	    divergence_supcon_temperature: float = 0.1
	    listnet_weight: float = 0.5
	    listnet_temperature: float = 1.0
	    chain_residual_aux_weight: float = 0.05
	    chain_residual_aux_target: float = 0.5

	    # v9: supervised contrastive loss keyed by archetype_id, applied to
	    # the same `community_output.encoded` representation as
	    # community_supcon. The 33 archetypes (Lancaster, paired with the
	    # Lexicon of Synthetic Interiority) are an external taxonomy that was
	    # only a held-out probe through v8b; v9 promotes them to a training
	    # signal alongside Reddit subreddit ids. Rows with archetype_id == -1
	    # (Reddit corpus) are masked out of this loss. Rows from the
	    # archetype-generations corpus carry archetype_id ∈ [1, 33] and
	    # contribute positive pairs.
	    archetype_supcon_weight: float = 0.0
	    archetype_supcon_temperature: float = 0.1


	@dataclass
	class TrainingConfig:
	    """Hyperparameters for a training run.

	    This file only declares the defaults; how each value is consumed is
	    decided by the training code, so the notes below are hedged.
	    """

	    # Optimizer settings.
	    lr: float = 3e-4
	    weight_decay: float = 0.01

	    # Run length and batch shape.
	    epochs: int = 3
	    batch_size: int = 16
	    max_seq_len: int = 512

	    # Validation / logging cadence and early-stopping patience.
	    # NOTE(review): units (steps vs. epochs) are not stated here —
	    # confirm against the training loop.
	    val_every: int = 1000
	    log_every: int = 100
	    patience: int = 5

	    # Schedule warmup and gradient clipping threshold.
	    warmup_steps: int = 500
	    grad_clip: float = 1.0


	@dataclass
	class SRTConfig:
	    """Top-level SRT Adapter configuration.

	    Bundles the backbone identifier, the layer indices where the adapter
	    hooks in, and one sub-config per component. Layer indices left at
	    their "auto" defaults are filled in by ``resolve_layer_indices`` once
	    the backbone depth is known.
	    """

	    backbone_id: str = "Qwen/Qwen2.5-7B"
	    backbone_dtype: str = "bfloat16"

	    # Layer hook indices — an empty list means auto-compute from backbone
	    # depth.
	    mah_layer_indices: list[int] = field(default_factory=list)
	    rrm_inject_indices: list[int] = field(default_factory=list)
	    community_layer_idx: int = -1  # negative = auto

	    num_mah_layers: int = 3

	    mah: MAHConfig = field(default_factory=MAHConfig)
	    rrm: RRMConfig = field(default_factory=RRMConfig)
	    ben: BENConfig = field(default_factory=BENConfig)
	    community: CommunityConfig = field(default_factory=CommunityConfig)
	    loss: LossConfig = field(default_factory=LossConfig)
	    training: TrainingConfig = field(default_factory=TrainingConfig)

	    def resolve_layer_indices(self, num_layers: int) -> None:
	        """Auto-compute layer indices from backbone depth if not set.

	        Fields that already hold an explicit value are left untouched.
	        The config is updated in place.

	        * ``mah_layer_indices``: ``num_mah_layers`` indices spaced
	          ``num_layers // (num_mah_layers + 1)`` apart. For example,
	          28 layers with 3 MAH layers gives ``[7, 14, 21]``.
	        * ``rrm_inject_indices``: every MAH index except the first.
	        * ``community_layer_idx``: ``max(1, num_layers // 7)``.

	        Args:
	            num_layers: Number of layers in the backbone.

	        Raises:
	            ValueError: If MAH indices must be auto-computed and
	                ``num_mah_layers`` is negative, or the backbone is too
	                shallow to give each MAH layer a distinct index.
	        """
	        if not self.mah_layer_indices:
	            n_mah = self.num_mah_layers
	            if n_mah < 0:
	                raise ValueError(
	                    f"num_mah_layers must be non-negative, got {n_mah}"
	                )
	            step = num_layers // (n_mah + 1)
	            # A step below 1 would collapse every hook onto index 0 (or a
	            # negative index), so reject it before mutating the config.
	            if n_mah > 0 and step < 1:
	                raise ValueError(
	                    f"cannot auto-place {n_mah} MAH layers in a backbone "
	                    f"with {num_layers} layers; need at least {n_mah + 1}"
	                )
	            self.mah_layer_indices = [step * k for k in range(1, n_mah + 1)]
	        if not self.rrm_inject_indices:
	            # Inject at all MAH layers except the first (let meta-state
	            # build up).
	            self.rrm_inject_indices = self.mah_layer_indices[1:]
	        if self.community_layer_idx < 0:
	            self.community_layer_idx = max(1, num_layers // 7)